From f615cf2391d1cbcaf60780fd7a71b84c7370a646 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 1 Dec 2023 17:44:29 +0100 Subject: [PATCH] (convert) Loosen up the rules enforcement for documents that have external links. --- .../converting/processor/DocumentClass.java | 25 +++++++++++++++++++ .../processor/DocumentProcessor.java | 15 ++++++++--- .../converting/processor/DomainProcessor.java | 7 +++--- .../AbstractDocumentProcessorPlugin.java | 3 ++- .../plugin/HtmlDocumentProcessorPlugin.java | 15 ++++++----- .../PlainTextDocumentProcessorPlugin.java | 3 ++- .../sideload/SideloaderProcessing.java | 3 ++- 7 files changed, 55 insertions(+), 16 deletions(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java new file mode 100644 index 00000000..ab450a2a --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentClass.java @@ -0,0 +1,25 @@ +package nu.marginalia.converting.processor; + +/** Classifies a document by how many external links point at it. + * Quality and length rules are selectively relaxed depending on the class. 
+ */ +public enum DocumentClass { + NORMAL, + EXTERNALLY_LINKED_ONCE, + EXTERNALLY_LINKED_MULTI; + + public boolean enforceQualityLimits() { + return this != EXTERNALLY_LINKED_MULTI; + } + + /** Factor multiplied onto the document length when determining + * whether it's long enough to be indexed; larger values are more lenient. + */ + public double lengthLimitModifier() { + return switch (this) { + case NORMAL -> 1.0; + case EXTERNALLY_LINKED_ONCE -> 2.; + case EXTERNALLY_LINKED_MULTI -> 10.; + }; + } +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 82e9c5d7..8e8841a0 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -1,6 +1,7 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; +import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.crawl.UrlIndexingState; @@ -37,14 +38,20 @@ public class DocumentProcessor { processorPlugins.add(plainTextDocumentProcessorPlugin); } - public ProcessedDocument process(CrawledDocument crawledDocument) { + public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) { ProcessedDocument ret = new ProcessedDocument(); try { // We must always provide the URL, even if we don't process the document ret.url = getDocumentUrl(crawledDocument); - processDocument(crawledDocument, ret); + DocumentClass documentClass = switch (externalDomainLinks.countForUrl(ret.url)) { + case 0 -> DocumentClass.NORMAL; + case 1 -> DocumentClass.EXTERNALLY_LINKED_ONCE; + default -> 
DocumentClass.EXTERNALLY_LINKED_MULTI; + }; + + processDocument(crawledDocument, documentClass, ret); } catch (DisqualifiedException ex) { ret.state = UrlIndexingState.DISQUALIFIED; @@ -60,7 +67,7 @@ public class DocumentProcessor { return ret; } - private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -79,7 +86,7 @@ public class DocumentProcessor { final var plugin = findPlugin(crawledDocument); - AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument); + AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, documentClass); ret.details = detailsWithWords.details(); ret.words = detailsWithWords.words(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 476dfc16..f5effebd 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -53,6 +53,9 @@ public class DomainProcessor { boolean cookies = false; String ip = ""; + + DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); + while (dataStream.hasNext()) { var data = dataStream.next(); @@ -75,7 +78,7 @@ public class DomainProcessor { continue; fixBadCanonicalTag(doc); - docs.add(documentProcessor.process(doc)); + docs.add(documentProcessor.process(doc, externalDomainLinks)); } 
catch (Exception ex) { logger.warn("Failed to process " + doc.url, ex); @@ -91,8 +94,6 @@ public class DomainProcessor { terms.add(HtmlFeature.COOKIES.getKeyword()); } - var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain); - for (var document : ret.documents) { if (document.details == null) continue; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 9dbd4290..0007eeb6 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -1,5 +1,6 @@ package nu.marginalia.converting.processor.plugin; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; @@ -21,7 +22,7 @@ public abstract class AbstractDocumentProcessorPlugin { this.languageFilter = languageFilter; } - public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; + public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException; public abstract boolean isApplicable(CrawledDocument doc); protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java 
index 0ab4f66c..4017778e 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.converting.language.LanguageFilter; import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor; import nu.marginalia.converting.processor.logic.links.FileLinks; @@ -15,7 +16,6 @@ import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; @@ -98,7 +98,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } @Override - public DetailsWithWords createDetails(CrawledDocument crawledDocument) + public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException { String documentBody = crawledDocument.documentBody; @@ -140,8 +140,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin // don't move this up! 
it uses title and quality // and is run before the heavy computations below - documentLengthLogic.validateLength(dld, specialization.lengthModifier()); - if (isDisqualified(url, ret)) { + documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); + + if (isDisqualified(documentClass, url, ret)) { throw new DisqualifiedException(DisqualificationReason.QUALITY); } @@ -205,9 +206,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$"); - private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) { + private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) { - if (ret.quality < minDocumentQuality) { + if (documentClass.enforceQualityLimits() + && ret.quality < minDocumentQuality) + { return true; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 60de67ed..05a9a210 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; import nu.marginalia.converting.language.LanguageFilter; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.keyword.DocumentKeywordExtractor; @@ -57,7 +58,7 @@ public class 
PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP } @Override - public DetailsWithWords createDetails(CrawledDocument crawledDocument) + public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException { String documentBody = crawledDocument.documentBody; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index a7aa70ba..65f0bd41 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -5,6 +5,7 @@ import com.google.inject.Singleton; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeUrl; @@ -54,7 +55,7 @@ public class SideloaderProcessing { var ret = new ProcessedDocument(); try { - var details = htmlProcessorPlugin.createDetails(crawledDoc); + var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL); ret.words = details.words();