mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(convert) Loosen up the rules enforcement for documents that have external links.
This commit is contained in:
parent
c984a97262
commit
f615cf2391
@ -0,0 +1,25 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
/** Classification of a document based on external factors, such as the
 * number of external links pointing at it.  Quality and length rules are
 * selectively relaxed for documents that are well linked from outside.
 */
public enum DocumentClass {
    /** Document with no known external links. */
    NORMAL,
    /** Document referenced by exactly one external link. */
    EXTERNALLY_LINKED_ONCE,
    /** Document referenced by two or more external links. */
    EXTERNALLY_LINKED_MULTI;

    /** @return true unless the document is linked from multiple external
     *          sources, in which case quality limits are waived entirely. */
    public boolean enforceQualityLimits() {
        return switch (this) {
            case EXTERNALLY_LINKED_MULTI -> false;
            default -> true;
        };
    }

    /** Factor multiplied onto the length of the document when determining
     * whether it is sufficiently long to be indexed.  Externally linked
     * documents are given a larger effective length.
     */
    public double lengthLimitModifier() {
        return switch (this) {
            case NORMAL -> 1.0;
            case EXTERNALLY_LINKED_ONCE -> 2.0;
            case EXTERNALLY_LINKED_MULTI -> 10.0;
        };
    }
}
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
@ -37,14 +38,20 @@ public class DocumentProcessor {
|
||||
processorPlugins.add(plainTextDocumentProcessorPlugin);
|
||||
}
|
||||
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument) {
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
try {
|
||||
// We must always provide the URL, even if we don't process the document
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
|
||||
processDocument(crawledDocument, ret);
|
||||
DocumentClass documentClass = switch (externalDomainLinks.countForUrl(ret.url)) {
|
||||
case 0 -> DocumentClass.NORMAL;
|
||||
case 1 -> DocumentClass.EXTERNALLY_LINKED_ONCE;
|
||||
default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
|
||||
};
|
||||
|
||||
processDocument(crawledDocument, documentClass, ret);
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = UrlIndexingState.DISQUALIFIED;
|
||||
@ -60,7 +67,7 @@ public class DocumentProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
|
||||
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
|
||||
|
||||
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
|
||||
if (crawlerStatus != CrawlerDocumentStatus.OK) {
|
||||
@ -79,7 +86,7 @@ public class DocumentProcessor {
|
||||
|
||||
final var plugin = findPlugin(crawledDocument);
|
||||
|
||||
AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument);
|
||||
AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, documentClass);
|
||||
|
||||
ret.details = detailsWithWords.details();
|
||||
ret.words = detailsWithWords.words();
|
||||
|
@ -53,6 +53,9 @@ public class DomainProcessor {
|
||||
|
||||
boolean cookies = false;
|
||||
String ip = "";
|
||||
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
|
||||
while (dataStream.hasNext()) {
|
||||
var data = dataStream.next();
|
||||
|
||||
@ -75,7 +78,7 @@ public class DomainProcessor {
|
||||
continue;
|
||||
fixBadCanonicalTag(doc);
|
||||
|
||||
docs.add(documentProcessor.process(doc));
|
||||
docs.add(documentProcessor.process(doc, externalDomainLinks));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
@ -91,8 +94,6 @@ public class DomainProcessor {
|
||||
terms.add(HtmlFeature.COOKIES.getKeyword());
|
||||
}
|
||||
|
||||
var externalDomainLinks = anchorTagsSource.getAnchorTags(ret.domain);
|
||||
|
||||
for (var document : ret.documents) {
|
||||
if (document.details == null)
|
||||
continue;
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.converting.processor.plugin;
|
||||
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
@ -21,7 +22,7 @@ public abstract class AbstractDocumentProcessorPlugin {
|
||||
this.languageFilter = languageFilter;
|
||||
}
|
||||
|
||||
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
|
||||
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException;
|
||||
public abstract boolean isApplicable(CrawledDocument doc);
|
||||
|
||||
protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
|
||||
|
@ -4,6 +4,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.MetaRobotsTag;
|
||||
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
|
||||
import nu.marginalia.converting.processor.logic.links.FileLinks;
|
||||
@ -15,7 +16,6 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
@ -98,7 +98,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
}
|
||||
|
||||
@Override
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument)
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
String documentBody = crawledDocument.documentBody;
|
||||
@ -140,8 +140,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
// don't move this up! it uses title and quality
|
||||
// and is run before the heavy computations below
|
||||
documentLengthLogic.validateLength(dld, specialization.lengthModifier());
|
||||
if (isDisqualified(url, ret)) {
|
||||
documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier());
|
||||
|
||||
if (isDisqualified(documentClass, url, ret)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
}
|
||||
|
||||
@ -205,9 +206,11 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
private static final GuardedRegex mastodonFeedRegex = GuardedRegexFactory.startsWith("/@", "^/@[^/]+/?$");
|
||||
|
||||
private boolean isDisqualified(EdgeUrl url, ProcessedDocumentDetails ret) {
|
||||
private boolean isDisqualified(DocumentClass documentClass, EdgeUrl url, ProcessedDocumentDetails ret) {
|
||||
|
||||
if (ret.quality < minDocumentQuality) {
|
||||
if (documentClass.enforceQualityLimits()
|
||||
&& ret.quality < minDocumentQuality)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.converting.processor.plugin;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.converting.language.LanguageFilter;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.keyword.DocumentKeywordExtractor;
|
||||
@ -57,7 +58,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
|
||||
}
|
||||
|
||||
@Override
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument)
|
||||
public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
String documentBody = crawledDocument.documentBody;
|
||||
|
@ -5,6 +5,7 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.processor.DocumentClass;
|
||||
import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin;
|
||||
import nu.marginalia.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@ -54,7 +55,7 @@ public class SideloaderProcessing {
|
||||
|
||||
var ret = new ProcessedDocument();
|
||||
try {
|
||||
var details = htmlProcessorPlugin.createDetails(crawledDoc);
|
||||
var details = htmlProcessorPlugin.createDetails(crawledDoc, DocumentClass.NORMAL);
|
||||
|
||||
ret.words = details.words();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user