Improve document processing in conversion.

* Add flags for long and short documents.
* Break out common length logic from plugins.
* Clean up related code.
This commit is contained in:
Viktor Lofgren 2023-03-28 16:38:00 +02:00
parent 1e65ac3940
commit 03bd892b95
7 changed files with 79 additions and 51 deletions

View File

@ -7,8 +7,8 @@ public enum DocumentFlags {
PlainText, PlainText,
Ads, Ads,
Tracking, Tracking,
UnusedBit4, ShortDocument,
UnusedBit5, LongDocument,
UnusedBit6, UnusedBit6,
UnusedBit7, UnusedBit7,
; ;

View File

@ -0,0 +1,36 @@
package nu.marginalia.converting.processor.logic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.converting.model.DisqualifiedException;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.DocumentFlags;
import java.util.EnumSet;
@Singleton
public class DocumentLengthLogic {
    /** Documents shorter than this many characters are flagged {@link DocumentFlags#ShortDocument}. */
    private static final int SHORT_DOCUMENT_LENGTH_CHARS = 2500;
    /** Documents longer than this many characters are flagged {@link DocumentFlags#LongDocument}. */
    private static final int LONG_DOCUMENT_LENGTH_CHARS = 7500;

    // Minimum word count below which a document is disqualified outright;
    // supplied via the "min-document-length" named binding.
    private final int minDocumentLength;

    @Inject
    public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
        this.minDocumentLength = minDocumentLength;
    }

    /**
     * Adds a length flag to the given flag set based on the document's length
     * in characters.  Documents between the short and long thresholds
     * (inclusive) receive no flag.
     *
     * @param lengthTextInChars document body length, in characters
     * @param flags             flag set to mutate
     */
    public void setLengthFlags(int lengthTextInChars, EnumSet<DocumentFlags> flags) {
        if (lengthTextInChars < SHORT_DOCUMENT_LENGTH_CHARS)
            flags.add(DocumentFlags.ShortDocument);
        else if (lengthTextInChars > LONG_DOCUMENT_LENGTH_CHARS)
            flags.add(DocumentFlags.LongDocument);
    }

    /**
     * Rejects documents that fall below the configured minimum word count.
     *
     * @throws DisqualifiedException with reason {@code LENGTH} if the document
     *         has fewer than {@code min-document-length} words
     */
    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
        if (dld.totalNumWords() < minDocumentLength) {
            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
        }
    }
}

View File

@ -11,19 +11,7 @@ import java.util.Set;
public class DocumentValuator { public class DocumentValuator {
private static final Set<String> filthTable = Set.of( public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
"xxx", "sex", "anal", "sexy",
"bdsm", "fetish", "porn", "camgirls", "dildo",
"gangbang", "buttplug", "orgasm", "vibrator",
"cameltoe", "download", "iso", "botox", "torrent",
"jackpot", "vegas", "casino", "coinbase", "poloniex",
"myetherwallet", "ethereum", "binance", "bitcoin",
"litecoin", "seo", "serp"
);
public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
double scriptPenalty = getScriptPenalty(parsedDocument); double scriptPenalty = getScriptPenalty(parsedDocument);
int textBodyLength = parsedDocument.text().length(); int textBodyLength = parsedDocument.text().length();
@ -35,8 +23,7 @@ public class DocumentValuator {
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset + htmlStandard.offset
- scriptPenalty - scriptPenalty;
- smutCoefficient;
} }

View File

@ -13,7 +13,6 @@ public class TitleExtractor {
@Inject @Inject
public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) { public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) {
this.maxTitleLength = maxTitleLength; this.maxTitleLength = maxTitleLength;
} }
public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) { public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) {

View File

@ -32,7 +32,7 @@ public abstract class AbstractDocumentProcessorPlugin {
protected static class MetaTagsBuilder { protected static class MetaTagsBuilder {
private final Set<String> tagWords = new HashSet<>(); private final Set<String> tagWords = new HashSet<>();
public Set<String> build(DocumentKeywordsBuilder dest) { public Set<String> build() {
return tagWords; return tagWords;
} }

View File

@ -5,6 +5,7 @@ import com.google.inject.name.Named;
import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.converting.processor.logic.links.LinkProcessor; import nu.marginalia.converting.processor.logic.links.LinkProcessor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.summary.SummaryExtractor; import nu.marginalia.summary.SummaryExtractor;
import nu.marginalia.link_parser.LinkParser; import nu.marginalia.link_parser.LinkParser;
@ -16,7 +17,6 @@ import nu.marginalia.converting.model.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.converting.processor.logic.*; import nu.marginalia.converting.processor.logic.*;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.gregex.GuardedRegex; import nu.marginalia.gregex.GuardedRegex;
@ -40,7 +40,6 @@ import static nu.marginalia.converting.model.DisqualifiedException.*;
public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
private final int minDocumentLength;
private final double minDocumentQuality; private final double minDocumentQuality;
private final SentenceExtractor sentenceExtractor; private final SentenceExtractor sentenceExtractor;
@ -50,6 +49,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private final SummaryExtractor summaryExtractor; private final SummaryExtractor summaryExtractor;
private final PubDateSniffer pubDateSniffer; private final PubDateSniffer pubDateSniffer;
private final DocumentLengthLogic documentLengthLogic;
private final MetaRobotsTag metaRobotsTag; private final MetaRobotsTag metaRobotsTag;
private static final DocumentValuator documentValuator = new DocumentValuator(); private static final DocumentValuator documentValuator = new DocumentValuator();
@ -57,16 +58,17 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
@Inject @Inject
public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality, @Named("min-document-quality") Double minDocumentQuality,
SentenceExtractor sentenceExtractor, SentenceExtractor sentenceExtractor,
FeatureExtractor featureExtractor, FeatureExtractor featureExtractor,
TitleExtractor titleExtractor, TitleExtractor titleExtractor,
DocumentKeywordExtractor keywordExtractor, DocumentKeywordExtractor keywordExtractor,
SummaryExtractor summaryExtractor, SummaryExtractor summaryExtractor,
PubDateSniffer pubDateSniffer, PubDateSniffer pubDateSniffer,
MetaRobotsTag metaRobotsTag) { DocumentLengthLogic documentLengthLogic,
this.minDocumentLength = minDocumentLength; MetaRobotsTag metaRobotsTag) {
this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality; this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractor = sentenceExtractor;
this.featureExtractor = featureExtractor; this.featureExtractor = featureExtractor;
@ -102,9 +104,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
final EdgeUrl url = new EdgeUrl(crawledDocument.url); final EdgeUrl url = new EdgeUrl(crawledDocument.url);
Document prunedDoc = prune(doc); DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
var dld = sentenceExtractor.extractSentences(prunedDoc);
checkDocumentLanguage(dld); checkDocumentLanguage(dld);
@ -113,11 +113,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.length = getLength(doc); ret.length = getLength(doc);
ret.standard = getHtmlStandard(doc); ret.standard = getHtmlStandard(doc);
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);
// don't move this up! it uses title and quality // don't move this up! it uses title and quality
// and is run before the heavy computations below // and is run before the heavy computations below
if (isDisqualified(url, dld, ret)) { documentLengthLogic.validateLength(dld);
if (isDisqualified(url, ret)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY); throw new DisqualifiedException(DisqualificationReason.QUALITY);
} }
@ -128,6 +129,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features); EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
documentLengthLogic.setLengthFlags(ret.length, documentFlags);
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags); ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@ -138,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
.addUrl(url) .addUrl(url)
.addFeatures(ret.features) .addFeatures(ret.features)
.addFormat(ret.standard) .addFormat(ret.standard)
.build(words); .build();
words.addAllSyntheticTerms(tagWords); words.addAllSyntheticTerms(tagWords);
@ -179,13 +182,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) { private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
if (ret.quality < minDocumentQuality) { if (ret.quality < minDocumentQuality) {
return true; return true;
} }
if (dld.totalNumWords() < minDocumentLength) {
return true;
}
// These pages shouldn't be publicly accessible // These pages shouldn't be publicly accessible
if ("phpinfo()".equals(ret.title)) { if ("phpinfo()".equals(ret.title)) {
return true; return true;

View File

@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
@ -28,20 +29,21 @@ import java.util.List;
public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
private final int minDocumentLength;
private final int maxTitleLength; private final int maxTitleLength;
private final SentenceExtractor sentenceExtractor; private final SentenceExtractor sentenceExtractor;
private final DocumentKeywordExtractor keywordExtractor; private final DocumentKeywordExtractor keywordExtractor;
private final PlainTextLogic plainTextLogic = new PlainTextLogic(); private final PlainTextLogic plainTextLogic = new PlainTextLogic();
private final DocumentLengthLogic documentLengthLogic;
@Inject @Inject
public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
@Named("max-title-length") Integer maxTitleLength,
SentenceExtractor sentenceExtractor, SentenceExtractor sentenceExtractor,
DocumentKeywordExtractor keywordExtractor) DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic
)
{ {
this.minDocumentLength = minDocumentLength; this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength; this.maxTitleLength = maxTitleLength;
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractor = sentenceExtractor;
this.keywordExtractor = keywordExtractor; this.keywordExtractor = keywordExtractor;
@ -68,15 +70,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
checkDocumentLanguage(dld); checkDocumentLanguage(dld);
if (dld.totalNumWords() < minDocumentLength) { documentLengthLogic.validateLength(dld);
throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
}
var ret = new ProcessedDocumentDetails(); var ret = new ProcessedDocumentDetails();
List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40); List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
ret.length = documentBody.length(); ret.length = documentBody.length();
ret.standard = HtmlStandard.PLAIN; ret.standard = HtmlStandard.PLAIN;
ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength); ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
@ -88,7 +89,11 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1)); final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText)); EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
documentLengthLogic.setLengthFlags(ret.length, documentFlags);
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@ -98,7 +103,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
.addUrl(url) .addUrl(url)
.addFeatures(ret.features) .addFeatures(ret.features)
.addFormat(ret.standard) .addFormat(ret.standard)
.build(words); .build();
words.addAllSyntheticTerms(tagWords); words.addAllSyntheticTerms(tagWords);