From 20970a6161d92b2059e697d29b89f573e26e07d6 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 14 Jul 2022 12:37:06 +0200 Subject: [PATCH] Make processor more lenient toward quality, accept content-types which specify charset --- .../wmsa/edge/converting/ConverterModule.java | 2 +- .../processor/DocumentProcessor.java | 14 +++++++- .../converting/processor/DomainProcessor.java | 32 +++++++++++++++++-- .../processor/InstructionsCompiler.java | 15 +++++---- .../processor/logic/DocumentValuator.java | 14 +++----- 5 files changed, 57 insertions(+), 20 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 4bf6eaea..1177c1a7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.net.URISyntaxException; -import java.nio.file.Path; public class ConverterModule extends AbstractModule { @@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule { bind(Gson.class).toInstance(createGson()); bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.); + bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.); bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100); bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index d04415fd..618e5efb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -113,7 +113,19 @@ public class DocumentProcessor { } private boolean isAcceptedContentType(CrawledDocument crawledDocument) { - return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase()); + if (crawledDocument.contentType == null) { + return false; + } + + var ct = crawledDocument.contentType; + + if (acceptedContentTypes.contains(ct)) + return true; + + if (ct.contains(";")) { + return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';'))); + } + return false; } private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java index 4343b0c3..b8b53f9d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java @@ -1,21 +1,29 @@ package nu.marginalia.wmsa.edge.converting.processor; import com.google.inject.Inject; +import com.google.inject.name.Named; +import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument; import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState; import java.util.ArrayList; import java.util.Collections; +import java.util.List; public class DomainProcessor { private final DocumentProcessor documentProcessor; + private final Double minAvgDocumentQuality; @Inject - public DomainProcessor(DocumentProcessor documentProcessor) { + public DomainProcessor(DocumentProcessor documentProcessor, + @Named("min-avg-document-quality") Double minAvgDocumentQuality + ) { this.documentProcessor = documentProcessor; + this.minAvgDocumentQuality = minAvgDocumentQuality; } public ProcessedDomain process(CrawledDomain crawledDomain) { @@ -37,17 +45,37 @@ public class DomainProcessor { ret.documents.add(processedDoc); } } - } else { ret.documents = Collections.emptyList(); } + double averageQuality = getAverageQuality(ret.documents); + if (averageQuality < minAvgDocumentQuality) { + ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED); + } + ret.state = getState(crawledDomain.crawlerStatus); return ret; } + private double getAverageQuality(List documents) { + int n = 0; + double q = 0.; + for (var doc : documents) { + if (doc.quality().isPresent()) { + n++; + q += doc.quality().getAsDouble(); + } + } + + if (n > 0) { + return q / n; + } + return -5.; + } + private EdgeDomainIndexingState getState(String crawlerStatus) { return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) { case OK -> EdgeDomainIndexingState.ACTIVE; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java index b75de436..07f1705a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/InstructionsCompiler.java @@ -42,15 +42,16 @@ public class InstructionsCompiler { Set seenUrls = new HashSet<>(documents.size()*4); Set seenDomains = new HashSet<>(documents.size()); - documents.stream().map(doc -> doc.url).forEach(seenUrls::add); - for (var doc : documents) { - if (doc.details == null) continue; - for (var url : doc.details.linksExternal) { - seenDomains.add(url.domain); + seenUrls.add(doc.url); + + if (doc.details != null) { + for (var url : doc.details.linksExternal) { + seenDomains.add(url.domain); + } + seenUrls.addAll(doc.details.linksExternal); + seenUrls.addAll(doc.details.linksInternal); } - seenUrls.addAll(doc.details.linksExternal); - seenUrls.addAll(doc.details.linksInternal); } ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java index 6f015ef6..b0423efa 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DocumentValuator.java @@ -1,8 +1,8 @@ package nu.marginalia.wmsa.edge.converting.processor.logic; import crawlercommons.utils.Strings; -import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException; import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard; import org.jsoup.nodes.Document; @@ -35,7 +35,7 @@ public class DocumentValuator { throw new DisqualifiedException(LENGTH); } - return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale + return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - scriptPenalty - smutCoefficient; @@ -52,17 +52,13 @@ public class DocumentValuator { double scriptPenalty = 0; for (var tag : scriptTags) { - String srcTag = tag.attr("src"); - if (Strings.isBlank(srcTag)) { - scriptPenalty += 1; - } - else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) { + String srcAttr = tag.attr("src"); + if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) { scriptPenalty += 0.49; } - else { + else if (!Strings.isBlank(srcAttr)) { scriptPenalty += 1; } - } return (int)(scriptPenalty + badScript + (scriptText.length())/1000.); }