Make processor more lenient toward quality, accept content-types which specify charset

This commit is contained in:
vlofgren 2022-07-14 12:37:06 +02:00
parent e9a270c015
commit 20970a6161
5 changed files with 57 additions and 20 deletions

View File

@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.net.URISyntaxException;
import java.nio.file.Path;
public class ConverterModule extends AbstractModule {
@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule {
bind(Gson.class).toInstance(createGson());
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

View File

@ -113,7 +113,19 @@ public class DocumentProcessor {
}
/**
 * Decides whether a crawled document's content type is one this processor accepts.
 *
 * <p>The comparison is case-insensitive and tolerates media-type parameters, so a
 * value such as {@code "Text/HTML; charset=UTF-8"} is looked up first as the whole
 * lower-cased string and then as the bare media type {@code "text/html"}.
 *
 * @param crawledDocument the document whose {@code contentType} field is inspected
 * @return {@code true} if the normalized value, or its part before the first
 *         {@code ';'}, is contained in {@code acceptedContentTypes}; {@code false}
 *         otherwise, including when the document has no content type
 */
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
    String contentType = crawledDocument.contentType;
    if (contentType == null) {
        return false;
    }

    // Media types are case-insensitive; Locale.ROOT keeps the folding independent
    // of the JVM's default locale.
    // NOTE(review): assumes acceptedContentTypes stores its entries in lower case — confirm.
    String normalized = contentType.trim().toLowerCase(java.util.Locale.ROOT);
    if (acceptedContentTypes.contains(normalized)) {
        return true;
    }

    int parametersStart = normalized.indexOf(';');
    if (parametersStart < 0) {
        return false;
    }

    // Optional whitespace may precede ';', so trim the bare media type as well.
    String mediaType = normalized.substring(0, parametersStart).trim();
    return acceptedContentTypes.contains(mediaType);
}
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {

View File

@ -1,21 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class DomainProcessor {
private final DocumentProcessor documentProcessor;
private final Double minAvgDocumentQuality;
/**
 * Creates a domain processor.
 *
 * @param documentProcessor     processor kept for use by this instance
 * @param minAvgDocumentQuality value injected under the name
 *                              {@code "min-avg-document-quality"}; kept as the threshold
 *                              against which a domain's average document quality is compared
 */
@Inject
public DomainProcessor(DocumentProcessor documentProcessor,
                       @Named("min-avg-document-quality") Double minAvgDocumentQuality) {
    this.documentProcessor = documentProcessor;
    this.minAvgDocumentQuality = minAvgDocumentQuality;
}
public ProcessedDomain process(CrawledDomain crawledDomain) {
@ -37,17 +45,37 @@ public class DomainProcessor {
ret.documents.add(processedDoc);
}
}
}
else {
ret.documents = Collections.emptyList();
}
double averageQuality = getAverageQuality(ret.documents);
if (averageQuality < minAvgDocumentQuality) {
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
}
ret.state = getState(crawledDomain.crawlerStatus);
return ret;
}
/**
 * Computes the arithmetic mean of the quality values of the given documents.
 *
 * <p>Documents whose {@code quality()} is not present are left out of both the
 * sum and the count.
 *
 * @param documents the documents to average over
 * @return the mean quality of the documents that have one, or {@code -5.} when
 *         none of them does (including when the list is empty)
 */
private double getAverageQuality(List<ProcessedDocument> documents) {
    double qualitySum = 0.;
    int ratedCount = 0;

    for (ProcessedDocument document : documents) {
        if (!document.quality().isPresent()) {
            continue;
        }
        ratedCount++;
        qualitySum += document.quality().getAsDouble();
    }

    return ratedCount > 0 ? qualitySum / ratedCount : -5.;
}
private EdgeDomainIndexingState getState(String crawlerStatus) {
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
case OK -> EdgeDomainIndexingState.ACTIVE;

View File

@ -42,15 +42,16 @@ public class InstructionsCompiler {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
documents.stream().map(doc -> doc.url).forEach(seenUrls::add);
for (var doc : documents) {
if (doc.details == null) continue;
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
seenUrls.add(doc.url);
if (doc.details != null) {
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));

View File

@ -1,8 +1,8 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import crawlercommons.utils.Strings;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
@ -35,7 +35,7 @@ public class DocumentValuator {
throw new DisqualifiedException(LENGTH);
}
return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty
- smutCoefficient;
@ -52,17 +52,13 @@ public class DocumentValuator {
double scriptPenalty = 0;
for (var tag : scriptTags) {
String srcTag = tag.attr("src");
if (Strings.isBlank(srcTag)) {
scriptPenalty += 1;
}
else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
String srcAttr = tag.attr("src");
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
scriptPenalty += 0.49;
}
else {
else if (!Strings.isBlank(srcAttr)) {
scriptPenalty += 1;
}
}
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
}