mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Improve document processing in conversion.
* Add flags for long and short documents.
* Break out common length logic from plugins.
* Clean up related code.
This commit is contained in:
parent
1e65ac3940
commit
03bd892b95
@ -7,8 +7,8 @@ public enum DocumentFlags {
|
|||||||
PlainText,
|
PlainText,
|
||||||
Ads,
|
Ads,
|
||||||
Tracking,
|
Tracking,
|
||||||
UnusedBit4,
|
ShortDocument,
|
||||||
UnusedBit5,
|
LongDocument,
|
||||||
UnusedBit6,
|
UnusedBit6,
|
||||||
UnusedBit7,
|
UnusedBit7,
|
||||||
;
|
;
|
||||||
|
@ -0,0 +1,36 @@
|
|||||||
|
package nu.marginalia.converting.processor.logic;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.converting.model.DisqualifiedException;
|
||||||
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
|
|
||||||
|
import java.util.EnumSet;
|
||||||
|
|
||||||
|
/**
 * Length-related document logic shared by the document processor plugins.
 *
 * Offers two independent operations: tagging a document as short or long
 * based on a length value ({@link #setLengthFlags}), and rejecting a document
 * whose word count is below a configured minimum ({@link #validateLength}).
 */
@Singleton
|
||||||
|
public class DocumentLengthLogic {
|
||||||
|
// Minimum value of dld.totalNumWords() accepted by validateLength;
// injected under the name "min-document-length".
private final int minDocumentLength;
|
||||||
|
// Lengths strictly below this value get the ShortDocument flag.
private final int shortDocumentLength = 2500;
|
||||||
|
// Lengths strictly above this value get the LongDocument flag.
private final int longDocumentLength = 7500;
|
||||||
|
|
||||||
|
/**
 * @param minDocumentLength the value bound to the name "min-document-length";
 *                          it is unboxed into an int field on assignment
 */
@Inject
|
||||||
|
public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
|
||||||
|
this.minDocumentLength = minDocumentLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Adds at most one length flag to the provided set, which is modified in place.
 *
 * ShortDocument is added when the length is below 2500, LongDocument when it
 * is above 7500. Lengths from 2500 through 7500 inclusive leave the set untouched.
 *
 * @param lengthTextInChars the document length; a character count, going by the parameter name
 * @param flags             the flag set to add to
 */
public void setLengthFlags(int lengthTextInChars, EnumSet<DocumentFlags> flags) {
|
||||||
|
if (lengthTextInChars < shortDocumentLength)
|
||||||
|
flags.add(DocumentFlags.ShortDocument);
|
||||||
|
else if (lengthTextInChars > longDocumentLength)
|
||||||
|
flags.add(DocumentFlags.LongDocument);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Rejects documents that contain too few words.
 *
 * Note that this check compares a word count, whereas setLengthFlags
 * takes a length named as a character count.
 *
 * @param dld the language data whose totalNumWords() is checked
 * @throws DisqualifiedException with reason LENGTH when the word count is
 *                               below the configured minimum
 */
public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
|
||||||
|
if (dld.totalNumWords() < minDocumentLength) {
|
||||||
|
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -11,19 +11,7 @@ import java.util.Set;
|
|||||||
|
|
||||||
public class DocumentValuator {
|
public class DocumentValuator {
|
||||||
|
|
||||||
private static final Set<String> filthTable = Set.of(
|
public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
|
||||||
"xxx", "sex", "anal", "sexy",
|
|
||||||
"bdsm", "fetish", "porn", "camgirls", "dildo",
|
|
||||||
"gangbang", "buttplug", "orgasm", "vibrator",
|
|
||||||
"cameltoe", "download", "iso", "botox", "torrent",
|
|
||||||
"jackpot", "vegas", "casino", "coinbase", "poloniex",
|
|
||||||
"myetherwallet", "ethereum", "binance", "bitcoin",
|
|
||||||
"litecoin", "seo", "serp"
|
|
||||||
|
|
||||||
);
|
|
||||||
|
|
||||||
public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
|
|
||||||
double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
|
|
||||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||||
|
|
||||||
int textBodyLength = parsedDocument.text().length();
|
int textBodyLength = parsedDocument.text().length();
|
||||||
@ -35,8 +23,7 @@ public class DocumentValuator {
|
|||||||
|
|
||||||
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
||||||
+ htmlStandard.offset
|
+ htmlStandard.offset
|
||||||
- scriptPenalty
|
- scriptPenalty;
|
||||||
- smutCoefficient;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,7 +13,6 @@ public class TitleExtractor {
|
|||||||
@Inject
|
@Inject
|
||||||
public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) {
|
public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) {
|
||||||
this.maxTitleLength = maxTitleLength;
|
this.maxTitleLength = maxTitleLength;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) {
|
public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) {
|
||||||
|
@ -32,7 +32,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
|||||||
protected static class MetaTagsBuilder {
|
protected static class MetaTagsBuilder {
|
||||||
private final Set<String> tagWords = new HashSet<>();
|
private final Set<String> tagWords = new HashSet<>();
|
||||||
|
|
||||||
public Set<String> build(DocumentKeywordsBuilder dest) {
|
public Set<String> build() {
|
||||||
return tagWords;
|
return tagWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import com.google.inject.name.Named;
|
|||||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||||
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
import nu.marginalia.converting.processor.logic.links.LinkProcessor;
|
||||||
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.summary.SummaryExtractor;
|
import nu.marginalia.summary.SummaryExtractor;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
@ -16,7 +17,6 @@ import nu.marginalia.converting.model.HtmlStandard;
|
|||||||
import nu.marginalia.model.idx.DocumentFlags;
|
import nu.marginalia.model.idx.DocumentFlags;
|
||||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
|
||||||
import nu.marginalia.converting.processor.logic.*;
|
import nu.marginalia.converting.processor.logic.*;
|
||||||
import nu.marginalia.model.crawl.PubDate;
|
import nu.marginalia.model.crawl.PubDate;
|
||||||
import nu.marginalia.gregex.GuardedRegex;
|
import nu.marginalia.gregex.GuardedRegex;
|
||||||
@ -40,7 +40,6 @@ import static nu.marginalia.converting.model.DisqualifiedException.*;
|
|||||||
|
|
||||||
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||||
|
|
||||||
private final int minDocumentLength;
|
|
||||||
private final double minDocumentQuality;
|
private final double minDocumentQuality;
|
||||||
|
|
||||||
private final SentenceExtractor sentenceExtractor;
|
private final SentenceExtractor sentenceExtractor;
|
||||||
@ -50,6 +49,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private final SummaryExtractor summaryExtractor;
|
private final SummaryExtractor summaryExtractor;
|
||||||
private final PubDateSniffer pubDateSniffer;
|
private final PubDateSniffer pubDateSniffer;
|
||||||
|
|
||||||
|
private final DocumentLengthLogic documentLengthLogic;
|
||||||
|
|
||||||
private final MetaRobotsTag metaRobotsTag;
|
private final MetaRobotsTag metaRobotsTag;
|
||||||
private static final DocumentValuator documentValuator = new DocumentValuator();
|
private static final DocumentValuator documentValuator = new DocumentValuator();
|
||||||
|
|
||||||
@ -57,16 +58,17 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
|
public HtmlDocumentProcessorPlugin(
|
||||||
@Named("min-document-quality") Double minDocumentQuality,
|
@Named("min-document-quality") Double minDocumentQuality,
|
||||||
SentenceExtractor sentenceExtractor,
|
SentenceExtractor sentenceExtractor,
|
||||||
FeatureExtractor featureExtractor,
|
FeatureExtractor featureExtractor,
|
||||||
TitleExtractor titleExtractor,
|
TitleExtractor titleExtractor,
|
||||||
DocumentKeywordExtractor keywordExtractor,
|
DocumentKeywordExtractor keywordExtractor,
|
||||||
SummaryExtractor summaryExtractor,
|
SummaryExtractor summaryExtractor,
|
||||||
PubDateSniffer pubDateSniffer,
|
PubDateSniffer pubDateSniffer,
|
||||||
MetaRobotsTag metaRobotsTag) {
|
DocumentLengthLogic documentLengthLogic,
|
||||||
this.minDocumentLength = minDocumentLength;
|
MetaRobotsTag metaRobotsTag) {
|
||||||
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.minDocumentQuality = minDocumentQuality;
|
this.minDocumentQuality = minDocumentQuality;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractor = sentenceExtractor;
|
||||||
this.featureExtractor = featureExtractor;
|
this.featureExtractor = featureExtractor;
|
||||||
@ -102,9 +104,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
|
||||||
|
|
||||||
Document prunedDoc = prune(doc);
|
DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
|
||||||
|
|
||||||
var dld = sentenceExtractor.extractSentences(prunedDoc);
|
|
||||||
|
|
||||||
checkDocumentLanguage(dld);
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
@ -113,11 +113,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
ret.length = getLength(doc);
|
ret.length = getLength(doc);
|
||||||
ret.standard = getHtmlStandard(doc);
|
ret.standard = getHtmlStandard(doc);
|
||||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||||
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
|
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);
|
||||||
|
|
||||||
// don't move this up! it uses title and quality
|
// don't move this up! it uses title and quality
|
||||||
// and is run before the heavy computations below
|
// and is run before the heavy computations below
|
||||||
if (isDisqualified(url, dld, ret)) {
|
documentLengthLogic.validateLength(dld);
|
||||||
|
if (isDisqualified(url, ret)) {
|
||||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -128,6 +129,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
|
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
|
||||||
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
|
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
|
||||||
|
|
||||||
|
documentLengthLogic.setLengthFlags(ret.length, documentFlags);
|
||||||
|
|
||||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
|
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
|
||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||||
@ -138,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
.addUrl(url)
|
.addUrl(url)
|
||||||
.addFeatures(ret.features)
|
.addFeatures(ret.features)
|
||||||
.addFormat(ret.standard)
|
.addFormat(ret.standard)
|
||||||
.build(words);
|
.build();
|
||||||
|
|
||||||
words.addAllSyntheticTerms(tagWords);
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
|
||||||
@ -179,13 +182,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
|
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
|
||||||
|
|
||||||
private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) {
|
private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
|
||||||
if (ret.quality < minDocumentQuality) {
|
if (ret.quality < minDocumentQuality) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (dld.totalNumWords() < minDocumentLength) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
// These pages shouldn't be publicly accessible
|
// These pages shouldn't be publicly accessible
|
||||||
if ("phpinfo()".equals(ret.title)) {
|
if ("phpinfo()".equals(ret.title)) {
|
||||||
return true;
|
return true;
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
import nu.marginalia.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||||
@ -28,20 +29,21 @@ import java.util.List;
|
|||||||
|
|
||||||
public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
|
||||||
|
|
||||||
private final int minDocumentLength;
|
|
||||||
private final int maxTitleLength;
|
private final int maxTitleLength;
|
||||||
private final SentenceExtractor sentenceExtractor;
|
private final SentenceExtractor sentenceExtractor;
|
||||||
private final DocumentKeywordExtractor keywordExtractor;
|
private final DocumentKeywordExtractor keywordExtractor;
|
||||||
private final PlainTextLogic plainTextLogic = new PlainTextLogic();
|
private final PlainTextLogic plainTextLogic = new PlainTextLogic();
|
||||||
|
private final DocumentLengthLogic documentLengthLogic;
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
|
public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
|
||||||
@Named("max-title-length") Integer maxTitleLength,
|
|
||||||
SentenceExtractor sentenceExtractor,
|
SentenceExtractor sentenceExtractor,
|
||||||
DocumentKeywordExtractor keywordExtractor)
|
DocumentKeywordExtractor keywordExtractor,
|
||||||
|
DocumentLengthLogic documentLengthLogic
|
||||||
|
)
|
||||||
{
|
{
|
||||||
this.minDocumentLength = minDocumentLength;
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.maxTitleLength = maxTitleLength;
|
this.maxTitleLength = maxTitleLength;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractor = sentenceExtractor;
|
||||||
this.keywordExtractor = keywordExtractor;
|
this.keywordExtractor = keywordExtractor;
|
||||||
@ -68,15 +70,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
|
|
||||||
checkDocumentLanguage(dld);
|
checkDocumentLanguage(dld);
|
||||||
|
|
||||||
if (dld.totalNumWords() < minDocumentLength) {
|
documentLengthLogic.validateLength(dld);
|
||||||
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
|
|
||||||
}
|
|
||||||
|
|
||||||
var ret = new ProcessedDocumentDetails();
|
var ret = new ProcessedDocumentDetails();
|
||||||
|
|
||||||
List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
|
List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
|
||||||
|
|
||||||
ret.length = documentBody.length();
|
ret.length = documentBody.length();
|
||||||
|
|
||||||
ret.standard = HtmlStandard.PLAIN;
|
ret.standard = HtmlStandard.PLAIN;
|
||||||
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
|
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
|
||||||
|
|
||||||
@ -88,7 +89,11 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
|
|
||||||
final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
|
final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
|
||||||
|
|
||||||
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText));
|
EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
|
||||||
|
|
||||||
|
documentLengthLogic.setLengthFlags(ret.length, documentFlags);
|
||||||
|
|
||||||
|
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
|
||||||
|
|
||||||
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
|
||||||
|
|
||||||
@ -98,7 +103,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
|||||||
.addUrl(url)
|
.addUrl(url)
|
||||||
.addFeatures(ret.features)
|
.addFeatures(ret.features)
|
||||||
.addFormat(ret.standard)
|
.addFormat(ret.standard)
|
||||||
.build(words);
|
.build();
|
||||||
|
|
||||||
words.addAllSyntheticTerms(tagWords);
|
words.addAllSyntheticTerms(tagWords);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user