Improve document processing in conversion.

* Add flags for long and short documents.
* Break out common length logic from plugins.
* Cleaning up of related code.

parent 1e65ac3940
commit 03bd892b95
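
In outline, both document processor plugins now delegate length handling to a shared
DocumentLengthLogic component: validateLength() replaces the per-plugin word-count
check, and setLengthFlags() marks documents as short or long before the metadata is
built. A condensed sketch of the resulting flow (simplified from the hunks below;
surrounding plugin code is elided):

    // Condensed from the HtmlDocumentProcessorPlugin hunks in this commit.
    documentLengthLogic.validateLength(dld);   // throws DisqualifiedException(LENGTH) on too few words

    EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
    documentLengthLogic.setLengthFlags(ret.length, documentFlags);  // may add ShortDocument or LongDocument

    ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0,
                                        (int) -ret.quality, documentFlags);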

DocumentFlags (nu.marginalia.model.idx):

@@ -7,8 +7,8 @@ public enum DocumentFlags {
     PlainText,
     Ads,
     Tracking,
-    UnusedBit4,
-    UnusedBit5,
+    ShortDocument,
+    LongDocument,
     UnusedBit6,
     UnusedBit7,
     ;
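
The UnusedBitN names suggest the flags are packed into a bitmask by ordinal when
stored in DocumentMetadata. The encoding itself is not part of this commit, so the
helper below is only a hypothetical illustration:

    // Hypothetical sketch of ordinal-based bit packing (uses java.util.EnumSet);
    // the real encoding lives in DocumentMetadata and is not shown in this diff.
    static int encode(EnumSet<DocumentFlags> flags) {
        int mask = 0;
        for (DocumentFlags flag : flags) {
            mask |= 1 << flag.ordinal();
        }
        return mask;
    }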

New file: nu.marginalia.converting.processor.logic.DocumentLengthLogic

@@ -0,0 +1,36 @@
+package nu.marginalia.converting.processor.logic;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.inject.name.Named;
+import nu.marginalia.converting.model.DisqualifiedException;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.model.idx.DocumentFlags;
+
+import java.util.EnumSet;
+
+@Singleton
+public class DocumentLengthLogic {
+    private final int minDocumentLength;
+    private final int shortDocumentLength = 2500;
+    private final int longDocumentLength = 7500;
+
+    @Inject
+    public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) {
+        this.minDocumentLength = minDocumentLength;
+    }
+
+    public void setLengthFlags(int lengthTextInChars, EnumSet<DocumentFlags> flags) {
+        if (lengthTextInChars < shortDocumentLength)
+            flags.add(DocumentFlags.ShortDocument);
+        else if (lengthTextInChars > longDocumentLength)
+            flags.add(DocumentFlags.LongDocument);
+    }
+
+    public void validateLength(DocumentLanguageData dld) throws DisqualifiedException {
+        if (dld.totalNumWords() < minDocumentLength) {
+            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
+        }
+    }
+
+}
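
A usage sketch of the new class, assuming a Guice-provided instance named lengthLogic.
The thresholds are the hardcoded character bounds above, so text between 2500 and 7500
characters gets neither flag:

    EnumSet<DocumentFlags> flags = EnumSet.noneOf(DocumentFlags.class);
    lengthLogic.setLengthFlags(1_000, flags);   // < 2500 chars: adds ShortDocument
    lengthLogic.setLengthFlags(10_000, flags);  // > 7500 chars: adds LongDocument
    lengthLogic.validateLength(dld);            // throws DisqualifiedException(LENGTH)
                                                // when dld.totalNumWords() < min-document-length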

DocumentValuator:

@@ -11,19 +11,7 @@ import java.util.Set;
 
 public class DocumentValuator {
 
-    private static final Set<String> filthTable = Set.of(
-            "xxx", "sex", "anal", "sexy",
-            "bdsm", "fetish", "porn", "camgirls", "dildo",
-            "gangbang", "buttplug", "orgasm", "vibrator",
-            "cameltoe", "download", "iso", "botox", "torrent",
-            "jackpot", "vegas", "casino", "coinbase", "poloniex",
-            "myetherwallet", "ethereum", "binance", "bitcoin",
-            "litecoin", "seo", "serp"
-
-    );
-
-    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
-        double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
+    public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException {
         double scriptPenalty = getScriptPenalty(parsedDocument);
 
         int textBodyLength = parsedDocument.text().length();
@@ -35,8 +23,7 @@ public class DocumentValuator {
 
         return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
                 + htmlStandard.offset
-                - scriptPenalty
-                - smutCoefficient;
+                - scriptPenalty;
     }
 
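
With the smut coefficient gone, the quality score reduces to a length-ratio term plus
the per-standard offset, minus the script penalty. A worked example with assumed
inputs (scale, offset, and the penalty all vary per document; the numbers here are
illustrative only):

    double textBodyLength = 5000, rawLength = 20000;
    double scale = 1.0, offset = 0.0, scriptPenalty = 1.0;

    double quality = Math.log(textBodyLength / (1 + rawLength)) * scale
                   + offset
                   - scriptPenalty;   // ln(0.25) - 1 ≈ -2.39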

TitleExtractor:

@@ -13,7 +13,6 @@ public class TitleExtractor {
     @Inject
     public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) {
         this.maxTitleLength = maxTitleLength;
-
     }
 
     public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) {

AbstractDocumentProcessorPlugin:

@@ -32,7 +32,7 @@ public abstract class AbstractDocumentProcessorPlugin {
     protected static class MetaTagsBuilder {
         private final Set<String> tagWords = new HashSet<>();
 
-        public Set<String> build(DocumentKeywordsBuilder dest) {
+        public Set<String> build() {
             return tagWords;
         }
 
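Callers now collect the returned tag set and feed it to the keywords builder
explicitly, rather than handing the builder into build(); both plugin hunks below
follow this pattern (builder construction elided in the diff):

    Set<String> tagWords = metaTagsBuilder
            .addUrl(url)
            .addFeatures(ret.features)
            .addFormat(ret.standard)
            .build();
    words.addAllSyntheticTerms(tagWords);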

HtmlDocumentProcessorPlugin:

@@ -5,6 +5,7 @@ import com.google.inject.name.Named;
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.converting.processor.logic.links.LinkProcessor;
+import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.model.crawl.HtmlFeature;
 import nu.marginalia.summary.SummaryExtractor;
 import nu.marginalia.link_parser.LinkParser;
@@ -16,7 +17,6 @@ import nu.marginalia.converting.model.HtmlStandard;
 import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.converting.processor.logic.*;
 import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.gregex.GuardedRegex;
@@ -40,7 +40,6 @@ import static nu.marginalia.converting.model.DisqualifiedException.*;
 
 public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
 
-    private final int minDocumentLength;
     private final double minDocumentQuality;
 
     private final SentenceExtractor sentenceExtractor;
@@ -50,6 +49,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private final SummaryExtractor summaryExtractor;
     private final PubDateSniffer pubDateSniffer;
 
+    private final DocumentLengthLogic documentLengthLogic;
+
     private final MetaRobotsTag metaRobotsTag;
     private static final DocumentValuator documentValuator = new DocumentValuator();
 
@@ -57,16 +58,17 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser);
 
     @Inject
-    public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
-                                       @Named("min-document-quality") Double minDocumentQuality,
-                                       SentenceExtractor sentenceExtractor,
-                                       FeatureExtractor featureExtractor,
-                                       TitleExtractor titleExtractor,
-                                       DocumentKeywordExtractor keywordExtractor,
-                                       SummaryExtractor summaryExtractor,
-                                       PubDateSniffer pubDateSniffer,
-                                       MetaRobotsTag metaRobotsTag) {
-        this.minDocumentLength = minDocumentLength;
+    public HtmlDocumentProcessorPlugin(
+            @Named("min-document-quality") Double minDocumentQuality,
+            SentenceExtractor sentenceExtractor,
+            FeatureExtractor featureExtractor,
+            TitleExtractor titleExtractor,
+            DocumentKeywordExtractor keywordExtractor,
+            SummaryExtractor summaryExtractor,
+            PubDateSniffer pubDateSniffer,
+            DocumentLengthLogic documentLengthLogic,
+            MetaRobotsTag metaRobotsTag) {
+        this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
         this.featureExtractor = featureExtractor;
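
DocumentLengthLogic now owns the @Named("min-document-length") constant, so this
constructor keeps only the quality threshold. The named constants are presumably bound
in a Guice module elsewhere in the tree; a hypothetical sketch of such a binding
(module name and values are assumptions, not part of this commit):

    import com.google.inject.AbstractModule;
    import com.google.inject.name.Names;

    public class ConverterModule extends AbstractModule {
        @Override
        protected void configure() {
            // Assumed values for illustration; the real bindings are not in this diff.
            bind(Integer.class)
                    .annotatedWith(Names.named("min-document-length"))
                    .toInstance(250);
            bind(Double.class)
                    .annotatedWith(Names.named("min-document-quality"))
                    .toInstance(-15.0);
        }
    }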

@@ -102,9 +104,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         final EdgeUrl url = new EdgeUrl(crawledDocument.url);
 
-        Document prunedDoc = prune(doc);
-
-        var dld = sentenceExtractor.extractSentences(prunedDoc);
+        DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc));
 
         checkDocumentLanguage(dld);
 
@@ -113,11 +113,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
+        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc);
 
         // don't move this up! it uses title and quality
         // and is run before the heavy computations below
-        if (isDisqualified(url, dld, ret)) {
+        documentLengthLogic.validateLength(dld);
+        if (isDisqualified(url, ret)) {
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
 
@@ -128,6 +129,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
         EnumSet<DocumentFlags> documentFlags = htmlFeatures2DocumentFlags(ret.features);
 
+        documentLengthLogic.setLengthFlags(ret.length, documentFlags);
+
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
@@ -138,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 .addUrl(url)
                 .addFeatures(ret.features)
                 .addFormat(ret.standard)
-                .build(words);
+                .build();
 
         words.addAllSyntheticTerms(tagWords);
 
@@ -179,13 +182,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
     private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
 
-    private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) {
+    private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
         if (ret.quality < minDocumentQuality) {
             return true;
         }
-        if (dld.totalNumWords() < minDocumentLength) {
-            return true;
-        }
 
         // These pages shouldn't be publicly accessible
        if ("phpinfo()".equals(ret.title)) {
             return true;

PlainTextDocumentProcessorPlugin:

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
@@ -28,20 +29,21 @@ import java.util.List;
 
 public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
 
-    private final int minDocumentLength;
     private final int maxTitleLength;
     private final SentenceExtractor sentenceExtractor;
     private final DocumentKeywordExtractor keywordExtractor;
     private final PlainTextLogic plainTextLogic = new PlainTextLogic();
+    private final DocumentLengthLogic documentLengthLogic;
 
 
     @Inject
-    public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength,
-                                            @Named("max-title-length") Integer maxTitleLength,
+    public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
                                             SentenceExtractor sentenceExtractor,
-                                            DocumentKeywordExtractor keywordExtractor)
+                                            DocumentKeywordExtractor keywordExtractor,
+                                            DocumentLengthLogic documentLengthLogic
+                                            )
     {
-        this.minDocumentLength = minDocumentLength;
+        this.documentLengthLogic = documentLengthLogic;
         this.maxTitleLength = maxTitleLength;
         this.sentenceExtractor = sentenceExtractor;
         this.keywordExtractor = keywordExtractor;
@@ -68,15 +70,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         checkDocumentLanguage(dld);
 
-        if (dld.totalNumWords() < minDocumentLength) {
-            throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH);
-        }
+        documentLengthLogic.validateLength(dld);
 
         var ret = new ProcessedDocumentDetails();
 
         List<String> firstFewLines = LineUtils.firstNLines(documentBody, 40);
 
         ret.length = documentBody.length();
 
         ret.standard = HtmlStandard.PLAIN;
         ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength);
@@ -88,7 +89,11 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1));
 
-        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText));
+        EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
+
+        documentLengthLogic.setLengthFlags(ret.length, documentFlags);
+
+        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags);
 
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
@@ -98,7 +103,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
                 .addUrl(url)
                 .addFeatures(ret.features)
                 .addFormat(ret.standard)
-                .build(words);
+                .build();
 
         words.addAllSyntheticTerms(tagWords);
 
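For plain text documents, the PlainText flag and the new length flags now travel
together in the same EnumSet; condensed from the hunk above:

    EnumSet<DocumentFlags> documentFlags = EnumSet.of(DocumentFlags.PlainText);
    documentLengthLogic.setLengthFlags(ret.length, documentFlags);
    // e.g. a 1200-character text file is indexed with {PlainText, ShortDocument}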