(convert) Loosen up the rules enforcement for documents that are externally linked.

This commit is contained in:
Viktor Lofgren 2023-12-01 17:44:29 +01:00
parent c984a97262
commit f615cf2391
7 changed files with 55 additions and 16 deletions

View File

@ -0,0 +1,25 @@
package nu.marginalia.converting.processor;
/**
 * Classifies a document by external factors, such as how often it is linked.
 * Quality and length rules are selectively enforced depending on the class.
 */
public enum DocumentClass {
    NORMAL(1.0),
    EXTERNALLY_LINKED_ONCE(2.0),
    EXTERNALLY_LINKED_MULTI(10.0);

    /** Per-class factor handed out by {@link #lengthLimitModifier()}. */
    private final double lengthFactor;

    DocumentClass(double lengthFactor) {
        this.lengthFactor = lengthFactor;
    }

    /**
     * Tells whether quality limits apply to documents of this class.
     *
     * @return {@code false} for {@link #EXTERNALLY_LINKED_MULTI},
     *         {@code true} for every other class
     */
    public boolean enforceQualityLimits() {
        if (this == EXTERNALLY_LINKED_MULTI) {
            return false;
        }
        return true;
    }

    /**
     * This factor is multiplied onto the length of the document
     * when determining whether it's sufficiently long to be indexed.
     *
     * @return 1.0 for {@link #NORMAL}, 2.0 for {@link #EXTERNALLY_LINKED_ONCE}
     *         and 10.0 for {@link #EXTERNALLY_LINKED_MULTI}
     */
    public double lengthLimitModifier() {
        return lengthFactor;
    }
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.converting.processor;
import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.model.crawl.UrlIndexingState;
@ -37,14 +38,20 @@ public class DocumentProcessor {
processorPlugins.add(plainTextDocumentProcessorPlugin);
}
public ProcessedDocument process(CrawledDocument crawledDocument) {
public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) {
ProcessedDocument ret = new ProcessedDocument();
try {
// We must always provide the URL, even if we don't process the document
ret.url = getDocumentUrl(crawledDocument);
processDocument(crawledDocument, ret);
DocumentClass documentClass = switch (externalDomainLinks.countForUrl(ret.url)) {
case 0 -> DocumentClass.NORMAL;
case 1 -> DocumentClass.EXTERNALLY_LINKED_ONCE;
default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
};
processDocument(crawledDocument, documentClass, ret);
}
catch (DisqualifiedException ex) {
ret.state = UrlIndexingState.DISQUALIFIED;
@ -60,7 +67,7 @@ public class DocumentProcessor {
return ret;
}
private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
if (crawlerStatus != CrawlerDocumentStatus.OK) {
@ -79,7 +86,7 @@ public class DocumentProcessor {
final var plugin = findPlugin(crawledDocument);
AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument);
AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, documentClass);
ret.details = detailsWithWords.details();
ret.words = detailsWithWords.words();

View File

@ -53,6 +53,9 @@ public class DomainProcessor {
boolean cookies = false;
String ip = "";
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
while (dataStream.hasNext()) {
var data = dataStream.next();
@ -75,7 +78,7 @@ public class DomainProcessor {
continue;
fixBadCanonicalTag(doc);
docs.add(documentProcessor.process(doc));
docs.add(documentProcessor.process(doc, externalDomainLinks));
}
catch (Exception ex) {
logger.warn("Failed to process " + doc.url, ex);
@ -91,8 +94,6 @@ public class DomainProcessor {
terms.add(HtmlFeature.COOKIES.getKeyword());
}
var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
for (var document : ret.documents) {
if (document.details == null)
continue;

View File

@ -1,5 +1,6 @@
package nu.marginalia.converting.processor.plugin;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.language.model.DocumentLanguageData;
@ -21,7 +22,7 @@ public abstract class AbstractDocumentProcessorPlugin {
this.languageFilter = languageFilter;
}
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException;
public abstract boolean isApplicable(CrawledDocument doc);
protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
import nu.marginalia.converting.processor.logic.links.FileLinks;
@ -15,7 +16,6 @@ import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
@ -98,7 +98,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
}
@Override
public DetailsWithWords createDetails(CrawledDocument crawledDocument)
public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException {
String documentBody = crawledDocument.documentBody;
@ -140,8 +140,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
// don't move this up! it uses title and quality
// and is run before the heavy computations below
documentLengthLogic.validateLength(dld, specialization.lengthModifier());
if (isDisqualified(url, ret)) {
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
if (isDisqualified(documentClass, url, ret)) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
@ -205,9 +206,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) {
if (ret.quality < minDocumentQuality) {
if (documentClass.enforceQualityLimits()
&& ret.quality < minDocumentQuality)
{
return true;
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.keyword.DocumentKeywordExtractor;
@ -57,7 +58,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
}
@Override
public DetailsWithWords createDetails(CrawledDocument crawledDocument)
public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass)
throws DisqualifiedException, URISyntaxException {
String documentBody = crawledDocument.documentBody;

View File

@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DocumentClass;
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl;
@ -54,7 +55,7 @@ public class SideloaderProcessing {
var ret = new ProcessedDocument();
try {
var details = htmlProcessorPlugin.createDetails(crawledDoc);
var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL);
ret.words = details.words();