Add bits and keywords for generator classes (docs, forum, wiki).

This commit is contained in:
Viktor Lofgren 2023-06-23 21:35:28 +02:00
parent 4c627d0e1d
commit bd2c3855ed
5 changed files with 58 additions and 40 deletions

View File

@ -5,10 +5,10 @@ import java.util.EnumSet;
public enum DocumentFlags { public enum DocumentFlags {
Javascript, Javascript,
PlainText, PlainText,
GeneratorSpammy, GeneratorDocs,
GeneratorVintage, GeneratorForum,
GeneratorBlog, GeneratorWiki,
GeneratorForumWiki, Unused6,
Unused7, Unused7,
Unused8, Unused8,
; ;

View File

@ -6,8 +6,9 @@ public enum GeneratorType {
ZOOMER_STATIC, ZOOMER_STATIC,
CMS, CMS,
SAAS, SAAS,
MANUAL_RETRO, MANUAL,
MANUAL_NEW, FORUM,
DOCS_FORUM_WIKI, WIKI,
DOCS,
ECOMMERCE_AND_SPAM ECOMMERCE_AND_SPAM
} }

View File

@ -4,6 +4,8 @@ import nu.marginalia.converting.model.GeneratorType;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
/** Extract keywords for the document meta generator tag */ /** Extract keywords for the document meta generator tag */
@ -13,8 +15,10 @@ public class DocumentGeneratorExtractor {
public DocumentGenerator generatorCleaned(Document doc) { public DocumentGenerator generatorCleaned(Document doc) {
var tags = doc.select("meta[name=generator]"); var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) { if (tags.size() == 0) {
return DocumentGenerator.unset(); // Some sites have a comment in the head instead of a meta tag
return fingerprintByComments(doc);
} }
if (tags.size() > 1) { if (tags.size() > 1) {
return DocumentGenerator.multiple(); return DocumentGenerator.multiple();
@ -22,7 +26,7 @@ public class DocumentGeneratorExtractor {
String generator = tags.attr("content"); String generator = tags.attr("content");
// Remove leading or trailing junk from the generator string, "powered by" etc. // Remove leading or trailing junk from the generator string, "powered by" etc.
generator = trim(generator); generator = removePrefixOrSuffix(generator);
if (generator.isBlank()) if (generator.isBlank())
return DocumentGenerator.unset(); return DocumentGenerator.unset();
@ -63,11 +67,29 @@ public class DocumentGeneratorExtractor {
} }
} }
private String trim(String generator) { // Fallback logic when there is no meta tag
/**
 * Derives a generator from HTML comments when the document carries no
 * {@code meta[name=generator]} tag.
 *
 * <p>Inspects the comment nodes exposed by {@code comments()} on the
 * document's {@code head} element(s). The only fingerprint recognised is
 * the marker text {@code "Generated by javadoc"}.
 *
 * @param doc the parsed document to inspect
 * @return {@code DocumentGenerator.of("javadoc")} if any inspected comment
 *         contains the javadoc marker, otherwise {@code DocumentGenerator.unset()}
 */
private DocumentGenerator fingerprintByComments(Document doc) {
    // anyMatch short-circuits on the first hit, mirroring an early return from a loop
    boolean generatedByJavadoc = doc.getElementsByTag("head")
            .comments()
            .stream()
            .anyMatch(headComment -> headComment.getData().contains("Generated by javadoc"));

    return generatedByJavadoc
            ? DocumentGenerator.of("javadoc")
            : DocumentGenerator.unset();
}
private String removePrefixOrSuffix(String generator) {
generator = generator.toLowerCase().trim(); generator = generator.toLowerCase().trim();
if (generator.startsWith("powered by ")) {
generator = generator.substring("powered by ".length()); // strip common prefixes
for (String prefix : Arrays.asList("powered by ", "generated by ")) {
if (generator.startsWith(prefix)) {
generator = generator.substring(prefix.length());
break;
}
} }
int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!' int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
@ -82,7 +104,8 @@ public class DocumentGeneratorExtractor {
} }
// Censor exact version strings, being able to search by major version is enough // Censor exact version strings, being able to search by major version is enough
// for any non-blackhat purpose // for any non-blackhat purpose; creating a directory with exact version string
// is a security risk for the site owner.
private String truncVersion(String part) { private String truncVersion(String part) {
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0); int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);
@ -101,6 +124,8 @@ public class DocumentGeneratorExtractor {
if (parts.length == 0) if (parts.length == 0)
return unset(); return unset();
List<String> keywords = new ArrayList<>(List.of(parts));
final GeneratorType type = switch (parts[0]) { final GeneratorType type = switch (parts[0]) {
case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity", case "joomla", "wordpress", "drupal", "plone", "postnuke", "divi", "freeway", "unicity",
"modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms", "modx", "sitemagic", "agility", "edlio", "blogger", "slider", "slider_revolution", "gravcms",
@ -121,23 +146,29 @@ public class DocumentGeneratorExtractor {
-> GeneratorType.BOOMER_STATIC; -> GeneratorType.BOOMER_STATIC;
case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome" case "hugo", "jekyll", "hakyll", "gatsby", "react", "gridsome"
-> GeneratorType.ZOOMER_STATIC; -> GeneratorType.ZOOMER_STATIC;
case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano" case "vi", "vim", "emacs", "orgmode", "hand", "vscode", "atom", "bbedit", "nano",
-> GeneratorType.MANUAL_NEW; "notepad.exe", "gedit", "me",
case "notepad.exe", "gedit", "me",
"geany", "sublime", "notepad++", "author", "geany", "sublime", "notepad++", "author",
"notepad", "namo", "arachnophilia", "scite", "notepad", "namo", "arachnophilia", "scite",
"alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa" "alleycode", "htmlkit", "acehtml", "bluefish", "htmled", "cutehtml", "fileedit", "cocoa"
-> GeneratorType.MANUAL_RETRO; -> GeneratorType.MANUAL;
case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", case "vbulletin", "phpbb", "mybb", "nodebb", "flarum", "discourse"
"discourse", "mediawiki", "dokuwiki", "pandoc", "mkdocs", "sharepoint", "doxygen" -> GeneratorType.FORUM;
-> GeneratorType.DOCS_FORUM_WIKI; case "mediawiki", "dokuwiki", "sharepoint"
-> GeneratorType.WIKI;
case "pandoc", "mkdocs", "doxygen", "javadoc"
-> GeneratorType.DOCS;
case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic" case "woocommerce", "shopfactory", "prestashop", "magento", "shopify", "sitedirect", "seomatic"
-> GeneratorType.ECOMMERCE_AND_SPAM; -> GeneratorType.ECOMMERCE_AND_SPAM;
default default
-> GeneratorType.UNKNOWN; -> GeneratorType.UNKNOWN;
}; };
return new DocumentGenerator(type, List.of(parts)); if (type != GeneratorType.UNKNOWN) {
keywords.add(type.name().toLowerCase());
}
return new DocumentGenerator(type, keywords);
} }
public static DocumentGenerator multiple() { public static DocumentGenerator multiple() {

View File

@ -182,10 +182,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
} }
switch (type) { switch (type) {
case ECOMMERCE_AND_SPAM -> flags.add(DocumentFlags.GeneratorSpammy); case DOCS -> flags.add(DocumentFlags.GeneratorDocs);
case DOCS_FORUM_WIKI -> flags.add(DocumentFlags.GeneratorForumWiki); case FORUM -> flags.add(DocumentFlags.GeneratorForum);
case ZOOMER_STATIC, MANUAL_NEW -> flags.add(DocumentFlags.GeneratorBlog); case WIKI -> flags.add(DocumentFlags.GeneratorWiki);
case MANUAL_RETRO, BOOMER_STATIC -> flags.add(DocumentFlags.GeneratorVintage);
default -> {} // no flags default -> {} // no flags
} }

View File

@ -64,26 +64,13 @@ public class RankingSearchSet implements SearchSet {
@Override @Override
public boolean contains(int urlId, long documentMetadata) { public boolean contains(int urlId, long documentMetadata) {
// For ranked search sets, exclude excessively commercial sites
// TODO: Maybe this particular check should be moved up to the search service and be opt-in?
if (DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorSpammy.asBit())) {
return false;
}
// This is the main check // This is the main check
if (set.contains(urlId) || set.isEmpty()) { if (set.contains(urlId) || set.isEmpty()) {
return true; return true;
} }
// TODO
// For the rest, let through some domains that are not in the set based on the generator tag return false;
if (identifier == SearchSetIdentifier.SMALLWEB) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorBlog.asBit());
}
if (identifier == SearchSetIdentifier.RETRO) {
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorVintage.asBit());
}
return DocumentMetadata.hasFlags(documentMetadata, DocumentFlags.GeneratorForumWiki.asBit());
} }
public void write() throws IOException { public void write() throws IOException {