diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java index c0d097df..b939a17a 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java @@ -7,8 +7,8 @@ public enum DocumentFlags { PlainText, Ads, Tracking, - UnusedBit4, - UnusedBit5, + ShortDocument, + LongDocument, UnusedBit6, UnusedBit7, ; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java new file mode 100644 index 00000000..797cf2b9 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java @@ -0,0 +1,36 @@ +package nu.marginalia.converting.processor.logic; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.inject.name.Named; +import nu.marginalia.converting.model.DisqualifiedException; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.model.idx.DocumentFlags; + +import java.util.EnumSet; + +@Singleton +public class DocumentLengthLogic { + private final int minDocumentLength; + private final int shortDocumentLength = 2500; + private final int longDocumentLength = 7500; + + @Inject + public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) { + this.minDocumentLength = minDocumentLength; + } + + public void setLengthFlags(int lengthTextInChars, EnumSet flags) { + if (lengthTextInChars < shortDocumentLength) + flags.add(DocumentFlags.ShortDocument); + else if (lengthTextInChars > longDocumentLength) + flags.add(DocumentFlags.LongDocument); + } + + public void validateLength(DocumentLanguageData dld) throws DisqualifiedException { + if (dld.totalNumWords() < minDocumentLength) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); + } + } + +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index a1542b16..6c3bf267 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -11,19 +11,7 @@ import java.util.Set; public class DocumentValuator { - private static final Set filthTable = Set.of( - "xxx", "sex", "anal", "sexy", - "bdsm", "fetish", "porn", "camgirls", "dildo", - "gangbang", "buttplug", "orgasm", "vibrator", - "cameltoe", "download", "iso", "botox", "torrent", - "jackpot", "vegas", "casino", "coinbase", "poloniex", - "myetherwallet", "ethereum", "binance", "bitcoin", - "litecoin", "seo", "serp" - - ); - - public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException { - double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count(); + public double getQuality(CrawledDocument crawledDocument, HtmlStandard htmlStandard, Document parsedDocument) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); int textBodyLength = parsedDocument.text().length(); @@ -35,8 +23,7 @@ public class DocumentValuator { return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - - scriptPenalty - - smutCoefficient; + - scriptPenalty; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java index 4a3293d7..920da41c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/TitleExtractor.java @@ -13,7 +13,6 @@ public class TitleExtractor { @Inject public TitleExtractor(@Named("max-title-length") Integer maxTitleLength) { this.maxTitleLength = maxTitleLength; - } public String getTitleAbbreviated(Document doc, DocumentLanguageData dld, String url) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index fa7fd118..48654280 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -32,7 +32,7 @@ public abstract class AbstractDocumentProcessorPlugin { protected static class MetaTagsBuilder { private final Set tagWords = new HashSet<>(); - public Set build(DocumentKeywordsBuilder dest) { + public Set build() { return tagWords; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index d080b535..d06cbeeb 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -5,6 +5,7 @@ import com.google.inject.name.Named; import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.links.LinkProcessor; +import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.summary.SummaryExtractor; import nu.marginalia.link_parser.LinkParser; @@ -16,7 +17,6 @@ import nu.marginalia.converting.model.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.converting.processor.logic.*; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.gregex.GuardedRegex; @@ -40,7 +40,6 @@ import static nu.marginalia.converting.model.DisqualifiedException.*; public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { - private final int minDocumentLength; private final double minDocumentQuality; private final SentenceExtractor sentenceExtractor; @@ -50,6 +49,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private final SummaryExtractor summaryExtractor; private final PubDateSniffer pubDateSniffer; + private final DocumentLengthLogic documentLengthLogic; + private final MetaRobotsTag metaRobotsTag; private static final DocumentValuator documentValuator = new DocumentValuator(); @@ -57,16 +58,17 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private static final FeedExtractor feedExtractor = new FeedExtractor(linkParser); @Inject - public HtmlDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, - @Named("min-document-quality") Double minDocumentQuality, - SentenceExtractor sentenceExtractor, - FeatureExtractor featureExtractor, - TitleExtractor titleExtractor, - DocumentKeywordExtractor keywordExtractor, - SummaryExtractor summaryExtractor, - PubDateSniffer pubDateSniffer, - MetaRobotsTag metaRobotsTag) { - this.minDocumentLength = minDocumentLength; + public HtmlDocumentProcessorPlugin( + @Named("min-document-quality") Double minDocumentQuality, + SentenceExtractor sentenceExtractor, + FeatureExtractor featureExtractor, + TitleExtractor titleExtractor, + DocumentKeywordExtractor keywordExtractor, + SummaryExtractor summaryExtractor, + PubDateSniffer pubDateSniffer, + DocumentLengthLogic documentLengthLogic, + MetaRobotsTag metaRobotsTag) { + this.documentLengthLogic = documentLengthLogic; this.minDocumentQuality = minDocumentQuality; this.sentenceExtractor = sentenceExtractor; this.featureExtractor = featureExtractor; @@ -102,9 +104,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin final EdgeUrl url = new EdgeUrl(crawledDocument.url); - Document prunedDoc = prune(doc); - - var dld = sentenceExtractor.extractSentences(prunedDoc); + DocumentLanguageData dld = sentenceExtractor.extractSentences(prune(doc)); checkDocumentLanguage(dld); @@ -113,11 +113,12 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin ret.length = getLength(doc); ret.standard = getHtmlStandard(doc); ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url); - ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld); + ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc); // don't move this up! it uses title and quality // and is run before the heavy computations below - if (isDisqualified(url, dld, ret)) { + documentLengthLogic.validateLength(dld); + if (isDisqualified(url, ret)) { throw new DisqualifiedException(DisqualificationReason.QUALITY); } @@ -128,6 +129,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true); EnumSet documentFlags = htmlFeatures2DocumentFlags(ret.features); + documentLengthLogic.setLengthFlags(ret.length, documentFlags); + ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); @@ -138,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin .addUrl(url) .addFeatures(ret.features) .addFormat(ret.standard) - .build(words); + .build(); words.addAllSyntheticTerms(tagWords); @@ -179,13 +182,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); - private boolean isDisqualified(EdgeUrl url, DocumentLanguageData dld, ProcessedDocumentDetails ret) { + private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) { if (ret.quality < minDocumentQuality) { return true; } - if (dld.totalNumWords() < minDocumentLength) { - return true; - } + // These pages shouldn't be publicly accessible if ("phpinfo()".equals(ret.title)) { return true; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index c7753fbc..c79b57ae 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; +import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; @@ -28,20 +29,21 @@ import java.util.List; public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { - private final int minDocumentLength; private final int maxTitleLength; private final SentenceExtractor sentenceExtractor; private final DocumentKeywordExtractor keywordExtractor; private final PlainTextLogic plainTextLogic = new PlainTextLogic(); + private final DocumentLengthLogic documentLengthLogic; @Inject - public PlainTextDocumentProcessorPlugin(@Named("min-document-length") Integer minDocumentLength, - @Named("max-title-length") Integer maxTitleLength, + public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength, SentenceExtractor sentenceExtractor, - DocumentKeywordExtractor keywordExtractor) + DocumentKeywordExtractor keywordExtractor, + DocumentLengthLogic documentLengthLogic + ) { - this.minDocumentLength = minDocumentLength; + this.documentLengthLogic = documentLengthLogic; this.maxTitleLength = maxTitleLength; this.sentenceExtractor = sentenceExtractor; this.keywordExtractor = keywordExtractor; @@ -68,15 +70,14 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP checkDocumentLanguage(dld); - if (dld.totalNumWords() < minDocumentLength) { - throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); - } + documentLengthLogic.validateLength(dld); var ret = new ProcessedDocumentDetails(); List firstFewLines = LineUtils.firstNLines(documentBody, 40); ret.length = documentBody.length(); + ret.standard = HtmlStandard.PLAIN; ret.title = StringUtils.truncate(plainTextLogic.getTitle(url, firstFewLines), maxTitleLength); @@ -88,7 +89,11 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP final PubDate pubDate = new PubDate(LocalDate.ofYearDay(1993, 1)); - ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(DocumentFlags.PlainText)); + EnumSet documentFlags = EnumSet.of(DocumentFlags.PlainText); + + documentLengthLogic.setLengthFlags(ret.length, documentFlags); + + ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, documentFlags); DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); @@ -98,7 +103,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP .addUrl(url) .addFeatures(ret.features) .addFormat(ret.standard) - .build(words); + .build(); words.addAllSyntheticTerms(tagWords);