mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
New synthetic keyword for document generator meta tag.
This commit is contained in:
parent
7326ba74fe
commit
a9a2960e86
@ -0,0 +1,89 @@
|
|||||||
|
package nu.marginalia.converting.processor.logic;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** Extract keywords for the document meta generator tag */
|
||||||
|
public class DocumentGeneratorExtractor {
|
||||||
|
|
||||||
|
public List<String> generatorCleaned(Document doc) {
|
||||||
|
|
||||||
|
String generator = doc
|
||||||
|
.select("meta[name=generator]")
|
||||||
|
.attr("content");
|
||||||
|
|
||||||
|
// Remove leading or trailing junk from the generator string, "powered by" etc.
|
||||||
|
generator = trim(generator);
|
||||||
|
|
||||||
|
if (generator.isBlank())
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
String[] parts = StringUtils.split(generator, " ,:!");
|
||||||
|
if (parts.length == 0)
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
int slashIdx = parts[0].indexOf('/');
|
||||||
|
if (slashIdx >= 0) {
|
||||||
|
// mozilla and staroffice has a really weird format
|
||||||
|
return List.of(parts[0].substring(0, slashIdx));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parts.length > 3) {
|
||||||
|
return Collections.emptyList(); // if it's still very long after trim(), it's probably a custom hand written message
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (parts[0]) {
|
||||||
|
case "joomla!":
|
||||||
|
return List.of("joomla");
|
||||||
|
case "plone":
|
||||||
|
case "claris":
|
||||||
|
case "one.com":
|
||||||
|
case "wix.com":
|
||||||
|
case "wpbakery":
|
||||||
|
return List.of(parts[0]);
|
||||||
|
case "adobe":
|
||||||
|
case "microsoft":
|
||||||
|
return List.of(parts[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parts.length > 1) {
|
||||||
|
return List.of(parts[0], parts[0] + "_" + truncVersion(parts[1]));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return List.of(parts[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String trim(String generator) {
|
||||||
|
|
||||||
|
generator = generator.toLowerCase().trim();
|
||||||
|
if (generator.startsWith("powered by ")) {
|
||||||
|
generator = generator.substring("powered by ".length());
|
||||||
|
}
|
||||||
|
|
||||||
|
int dashIdx = generator.indexOf('-'); // Some strings have values like 'foobar 2.3 - the free online generator!'
|
||||||
|
if (dashIdx >= 0) {
|
||||||
|
generator = generator.substring(0, dashIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!StringUtils.isAsciiPrintable(generator))
|
||||||
|
return "";
|
||||||
|
|
||||||
|
return generator;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Censor exact version strings, being able to search by major version is enough
|
||||||
|
// for any non-blackhat purpose
|
||||||
|
private String truncVersion(String part) {
|
||||||
|
int periodIdx = part.indexOf('.', part.startsWith("0.") ? 2 : 0);
|
||||||
|
|
||||||
|
if (periodIdx < 0)
|
||||||
|
return part;
|
||||||
|
|
||||||
|
return part.substring(0, periodIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -59,6 +59,15 @@ public abstract class AbstractDocumentProcessorPlugin {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public MetaTagsBuilder addGenerator(List<String> generators) {
|
||||||
|
|
||||||
|
for (var generator : generators) {
|
||||||
|
add("generator", generator);
|
||||||
|
}
|
||||||
|
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public MetaTagsBuilder addFormat(HtmlStandard standard) {
|
public MetaTagsBuilder addFormat(HtmlStandard standard) {
|
||||||
|
|
||||||
add("format", standard);
|
add("format", standard);
|
||||||
|
@ -54,6 +54,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private final DocumentLengthLogic documentLengthLogic;
|
private final DocumentLengthLogic documentLengthLogic;
|
||||||
|
|
||||||
private final MetaRobotsTag metaRobotsTag;
|
private final MetaRobotsTag metaRobotsTag;
|
||||||
|
private final DocumentGeneratorExtractor documentGeneratorExtractor;
|
||||||
private static final DocumentValuator documentValuator = new DocumentValuator();
|
private static final DocumentValuator documentValuator = new DocumentValuator();
|
||||||
|
|
||||||
private static final LinkParser linkParser = new LinkParser();
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
@ -69,7 +70,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
SummaryExtractor summaryExtractor,
|
SummaryExtractor summaryExtractor,
|
||||||
PubDateSniffer pubDateSniffer,
|
PubDateSniffer pubDateSniffer,
|
||||||
DocumentLengthLogic documentLengthLogic,
|
DocumentLengthLogic documentLengthLogic,
|
||||||
MetaRobotsTag metaRobotsTag) {
|
MetaRobotsTag metaRobotsTag,
|
||||||
|
DocumentGeneratorExtractor documentGeneratorExtractor) {
|
||||||
this.documentLengthLogic = documentLengthLogic;
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.minDocumentQuality = minDocumentQuality;
|
this.minDocumentQuality = minDocumentQuality;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractor = sentenceExtractor;
|
||||||
@ -81,6 +83,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
this.pubDateSniffer = pubDateSniffer;
|
this.pubDateSniffer = pubDateSniffer;
|
||||||
this.metaRobotsTag = metaRobotsTag;
|
this.metaRobotsTag = metaRobotsTag;
|
||||||
|
|
||||||
|
this.documentGeneratorExtractor = documentGeneratorExtractor;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -143,12 +146,15 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
ret.description = getDescription(doc, words.importantWords);
|
ret.description = getDescription(doc, words.importantWords);
|
||||||
|
|
||||||
|
List<String> generatorParts = documentGeneratorExtractor.generatorCleaned(doc);
|
||||||
|
|
||||||
var tagWords = new MetaTagsBuilder()
|
var tagWords = new MetaTagsBuilder()
|
||||||
.addDomainCrawlData(crawledDomain)
|
.addDomainCrawlData(crawledDomain)
|
||||||
.addPubDate(pubDate)
|
.addPubDate(pubDate)
|
||||||
.addUrl(url)
|
.addUrl(url)
|
||||||
.addFeatures(features)
|
.addFeatures(features)
|
||||||
.addFormat(standard)
|
.addFormat(standard)
|
||||||
|
.addGenerator(generatorParts)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
words.addAllSyntheticTerms(tagWords);
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
Loading…
Reference in New Issue
Block a user