New synthetic keyword for document generator meta tag.

This commit is contained in:
Viktor Lofgren 2023-06-20 16:25:49 +02:00
parent 7326ba74fe
commit a9a2960e86
3 changed files with 105 additions and 1 deletions

View File

@ -0,0 +1,89 @@
package nu.marginalia.converting.processor.logic;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
/**
 * Extracts synthetic keywords from a document's {@code <meta name="generator">} tag.
 * <p>
 * The raw generator string is lower-cased, stripped of common boilerplate and then
 * reduced to the generator name, optionally accompanied by a {@code name_version}
 * keyword where the version is truncated to its major component.
 */
public class DocumentGeneratorExtractor {

    /**
     * Derives generator keywords from the document's generator meta tag.
     *
     * @param doc the parsed HTML document to inspect
     * @return an immutable list of keywords; empty when the tag is absent or blank,
     *         contains characters outside printable ASCII, has no usable name, or
     *         still consists of more than three tokens after cleaning
     */
    public List<String> generatorCleaned(Document doc) {
        String generator = doc
                .select("meta[name=generator]")
                .attr("content");

        // Remove leading or trailing junk from the generator string, "powered by" etc.
        generator = trim(generator);

        if (generator.isBlank()) {
            return Collections.emptyList();
        }

        String[] parts = StringUtils.split(generator, " ,:!");
        if (parts.length == 0) {
            return Collections.emptyList();
        }

        int slashIdx = parts[0].indexOf('/');
        if (slashIdx == 0) {
            // Nothing precedes the slash, so there is no name to report;
            // avoid emitting an empty keyword.
            return Collections.emptyList();
        }
        if (slashIdx > 0) {
            // mozilla and staroffice have a really weird format, "name/version ...";
            // keep only the name
            return List.of(parts[0].substring(0, slashIdx));
        }

        if (parts.length > 3) {
            // if it's still very long after trim(), it's probably a custom hand written message
            return Collections.emptyList();
        }

        // Note: '!' is one of the split delimiters above, so "joomla!" always arrives
        // here as plain "joomla" and is handled by the generic path below.
        switch (parts[0]) {
            case "plone":
            case "claris":
            case "one.com":
            case "wix.com":
            case "wpbakery":
                // For these, only the name is reported; any following tokens are ignored
                return List.of(parts[0]);
            case "adobe":
            case "microsoft":
                // Vendor prefix: report the token that follows it. When the vendor name
                // stands alone there is no second token, so fall through to the generic
                // handling instead of indexing past the end of the array.
                if (parts.length > 1) {
                    return List.of(parts[1]);
                }
                break;
            default:
                break;
        }

        if (parts.length > 1) {
            return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
        }
        return List.of(parts[0]);
    }

    /**
     * Normalizes the raw generator string: lower-cases it, drops a leading
     * "powered by " and cuts everything from the first '-' onward.
     *
     * @param generator the raw content attribute value
     * @return the cleaned string, or {@code ""} if the result is not printable ASCII
     */
    private String trim(String generator) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
        // (e.g. a Turkish locale would map 'I' to a non-ASCII dotless i, which would
        // then be rejected by the ASCII check below).
        generator = generator.toLowerCase(Locale.ROOT).trim();

        if (generator.startsWith("powered by ")) {
            generator = generator.substring("powered by ".length());
        }

        // Some strings have values like 'foobar 2.3 - the free online generator!'.
        // This cuts at the first hyphen wherever it occurs, including inside a name.
        int dashIdx = generator.indexOf('-');
        if (dashIdx >= 0) {
            generator = generator.substring(0, dashIdx);
        }

        if (!StringUtils.isAsciiPrintable(generator)) {
            return "";
        }

        return generator;
    }

    /**
     * Truncates a version token at its first period, or at the second period when
     * the token starts with "0." (so "0.9.1" becomes "0.9" and "2.3" becomes "2").
     * Tokens without such a period are returned unchanged.
     * <p>
     * Censor exact version strings, being able to search by major version is enough
     * for any non-blackhat purpose.
     *
     * @param part the token following the generator name
     * @return the truncated token
     */
    private String truncVersion(String part) {
        int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);

        if (periodIdx < 0) {
            return part;
        }

        return part.substring(0, periodIdx);
    }
}

View File

@ -59,6 +59,15 @@ public abstract class AbstractDocumentProcessorPlugin {
return this;
}
/**
 * Passes each of the given keywords to {@code add} under the key "generator",
 * in list order.
 *
 * @param generators the generator keywords to add; an empty list adds nothing
 * @return this builder, to allow call chaining
 */
public MetaTagsBuilder addGenerator(List<String> generators) {
    generators.forEach(keyword -> add("generator", keyword));
    return this;
}
public MetaTagsBuilder addFormat(HtmlStandard standard) {
add("format", standard);

View File

@ -54,6 +54,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final DocumentLengthLogic documentLengthLogic;
private final MetaRobotsTag metaRobotsTag;
private final DocumentGeneratorExtractor documentGeneratorExtractor;
private static final DocumentValuator documentValuator = new DocumentValuator();
private static final LinkParser linkParser = new LinkParser();
@ -69,7 +70,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
SummaryExtractor summaryExtractor,
PubDateSniffer pubDateSniffer,
DocumentLengthLogic documentLengthLogic,
MetaRobotsTag metaRobotsTag) {
MetaRobotsTag metaRobotsTag,
DocumentGeneratorExtractor documentGeneratorExtractor) {
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor;
@ -81,6 +83,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
this.pubDateSniffer = pubDateSniffer;
this.metaRobotsTag = metaRobotsTag;
this.documentGeneratorExtractor = documentGeneratorExtractor;
}
@Override
@ -143,12 +146,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.description = getDescription(doc, words.importantWords);
List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
var tagWords = new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)
.addPubDate(pubDate)
.addUrl(url)
.addFeatures(features)
.addFormat(standard)
.addGenerator(generatorParts)
.build();
words.addAllSyntheticTerms(tagWords);