mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
New synthetic keyword for document generator meta tag.
This commit is contained in:
parent
7326ba74fe
commit
a9a2960e86
@ -0,0 +1,89 @@
|
||||
package nu.marginalia.converting.processor.logic;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/** Extract keywords for the document meta generator tag */
|
||||
public class DocumentGeneratorExtractor {
|
||||
|
||||
public List<String> generatorCleaned(Document doc) {
|
||||
|
||||
String generator = doc
|
||||
.select("meta[name=generator]")
|
||||
.attr("content");
|
||||
|
||||
// Remove leading or trailing junk from the generator string, "powered by" etc.
|
||||
generator = trim(generator);
|
||||
|
||||
if (generator.isBlank())
|
||||
return Collections.emptyList();
|
||||
|
||||
String[] parts = StringUtils.split(generator, " ,:!");
|
||||
if (parts.length == 0)
|
||||
return Collections.emptyList();
|
||||
|
||||
int slashIdx = parts[0].indexOf('/');
|
||||
if (slashIdx >= 0) {
|
||||
// mozilla and staroffice has a really weird format
|
||||
return List.of(parts[0].substring(0, slashIdx));
|
||||
}
|
||||
|
||||
if (parts.length > 3) {
|
||||
return Collections.emptyList(); // if it's still very long after trim(), it's probably a custom hand written message
|
||||
}
|
||||
|
||||
switch (parts[0]) {
|
||||
case "joomla!":
|
||||
return List.of("joomla");
|
||||
case "plone":
|
||||
case "claris":
|
||||
case "one.com":
|
||||
case "wix.com":
|
||||
case "wpbakery":
|
||||
return List.of(parts[0]);
|
||||
case "adobe":
|
||||
case "microsoft":
|
||||
return List.of(parts[1]);
|
||||
}
|
||||
|
||||
if (parts.length > 1) {
|
||||
return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||
}
|
||||
else {
|
||||
return List.of(parts[0]);
|
||||
}
|
||||
}
|
||||
|
||||
private String trim(String generator) {
|
||||
|
||||
generator = generator.toLowerCase().trim();
|
||||
if (generator.startsWith("powered by ")) {
|
||||
generator = generator.substring("powered by ".length());
|
||||
}
|
||||
|
||||
int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
|
||||
if (dashIdx >= 0) {
|
||||
generator = generator.substring(0, dashIdx);
|
||||
}
|
||||
|
||||
if (!StringUtils.isAsciiPrintable(generator))
|
||||
return "";
|
||||
|
||||
return generator;
|
||||
}
|
||||
|
||||
// Censor exact version strings, being able to search by major version is enough
|
||||
// for any non-blackhat purpose
|
||||
private String truncVersion(String part) {
|
||||
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);
|
||||
|
||||
if (periodIdx < 0)
|
||||
return part;
|
||||
|
||||
return part.substring(0, periodIdx);
|
||||
}
|
||||
|
||||
}
|
@ -59,6 +59,15 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
return this;
|
||||
}
|
||||
|
||||
public MetaTagsBuilder addGenerator(List<String> generators) {
|
||||
|
||||
for (var generator : generators) {
|
||||
add("generator", generator);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public MetaTagsBuilder addFormat(HtmlStandard standard) {
|
||||
|
||||
add("format", standard);
|
||||
|
@ -54,6 +54,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
private final DocumentLengthLogic documentLengthLogic;
|
||||
|
||||
private final MetaRobotsTag metaRobotsTag;
|
||||
private final DocumentGeneratorExtractor documentGeneratorExtractor;
|
||||
private static final DocumentValuator documentValuator = new DocumentValuator();
|
||||
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
@ -69,7 +70,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
SummaryExtractor summaryExtractor,
|
||||
PubDateSniffer pubDateSniffer,
|
||||
DocumentLengthLogic documentLengthLogic,
|
||||
MetaRobotsTag metaRobotsTag) {
|
||||
MetaRobotsTag metaRobotsTag,
|
||||
DocumentGeneratorExtractor documentGeneratorExtractor) {
|
||||
this.documentLengthLogic = documentLengthLogic;
|
||||
this.minDocumentQuality = minDocumentQuality;
|
||||
this.sentenceExtractor = sentenceExtractor;
|
||||
@ -81,6 +83,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
this.pubDateSniffer = pubDateSniffer;
|
||||
this.metaRobotsTag = metaRobotsTag;
|
||||
|
||||
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -143,12 +146,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
ret.description = getDescription(doc, words.importantWords);
|
||||
|
||||
List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||
|
||||
var tagWords = new MetaTagsBuilder()
|
||||
.addDomainCrawlData(crawledDomain)
|
||||
.addPubDate(pubDate)
|
||||
.addUrl(url)
|
||||
.addFeatures(features)
|
||||
.addFormat(standard)
|
||||
.addGenerator(generatorParts)
|
||||
.build();
|
||||
|
||||
words.addAllSyntheticTerms(tagWords);
|
||||
|
Loading…
Reference in New Issue
Block a user