mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Make processor more lenient toward quality, accept content-types which specify charset
This commit is contained in: parent e9a270c015, commit 20970a6161
@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class ConverterModule extends AbstractModule {
|
||||
|
||||
@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule {
|
||||
bind(Gson.class).toInstance(createGson());
|
||||
|
||||
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
|
||||
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
|
||||
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
|
||||
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||
|
@ -113,7 +113,19 @@ public class DocumentProcessor {
|
||||
}
|
||||
|
||||
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
|
||||
return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase());
|
||||
if (crawledDocument.contentType == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
var ct = crawledDocument.contentType;
|
||||
|
||||
if (acceptedContentTypes.contains(ct))
|
||||
return true;
|
||||
|
||||
if (ct.contains(";")) {
|
||||
return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';')));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
|
||||
|
@ -1,21 +1,29 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class DomainProcessor {
|
||||
private final DocumentProcessor documentProcessor;
|
||||
private final Double minAvgDocumentQuality;
|
||||
|
||||
/**
 * Constructs the processor with its per-document delegate and the configured
 * quality floor.
 *
 * @param documentProcessor     processes individual crawled documents
 * @param minAvgDocumentQuality domains whose average document quality falls
 *                              below this threshold have all their documents
 *                              marked {@code EdgeUrlState.DISQUALIFIED}
 *                              (injected via the {@code "min-avg-document-quality"} binding)
 */
@Inject
public DomainProcessor(DocumentProcessor documentProcessor,
                       @Named("min-avg-document-quality") Double minAvgDocumentQuality
                       ) {
    this.documentProcessor = documentProcessor;
    this.minAvgDocumentQuality = minAvgDocumentQuality;
}
|
||||
|
||||
public ProcessedDomain process(CrawledDomain crawledDomain) {
|
||||
@ -37,17 +45,37 @@ public class DomainProcessor {
|
||||
ret.documents.add(processedDoc);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else {
|
||||
ret.documents = Collections.emptyList();
|
||||
}
|
||||
|
||||
double averageQuality = getAverageQuality(ret.documents);
|
||||
if (averageQuality < minAvgDocumentQuality) {
|
||||
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
|
||||
}
|
||||
|
||||
ret.state = getState(crawledDomain.crawlerStatus);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private double getAverageQuality(List<ProcessedDocument> documents) {
|
||||
int n = 0;
|
||||
double q = 0.;
|
||||
for (var doc : documents) {
|
||||
if (doc.quality().isPresent()) {
|
||||
n++;
|
||||
q += doc.quality().getAsDouble();
|
||||
}
|
||||
}
|
||||
|
||||
if (n > 0) {
|
||||
return q / n;
|
||||
}
|
||||
return -5.;
|
||||
}
|
||||
|
||||
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
||||
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
||||
case OK -> EdgeDomainIndexingState.ACTIVE;
|
||||
|
@ -42,15 +42,16 @@ public class InstructionsCompiler {
|
||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||
|
||||
documents.stream().map(doc -> doc.url).forEach(seenUrls::add);
|
||||
|
||||
for (var doc : documents) {
|
||||
if (doc.details == null) continue;
|
||||
for (var url : doc.details.linksExternal) {
|
||||
seenDomains.add(url.domain);
|
||||
seenUrls.add(doc.url);
|
||||
|
||||
if (doc.details != null) {
|
||||
for (var url : doc.details.linksExternal) {
|
||||
seenDomains.add(url.domain);
|
||||
}
|
||||
seenUrls.addAll(doc.details.linksExternal);
|
||||
seenUrls.addAll(doc.details.linksInternal);
|
||||
}
|
||||
seenUrls.addAll(doc.details.linksExternal);
|
||||
seenUrls.addAll(doc.details.linksInternal);
|
||||
}
|
||||
|
||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -35,7 +35,7 @@ public class DocumentValuator {
|
||||
throw new DisqualifiedException(LENGTH);
|
||||
}
|
||||
|
||||
return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
|
||||
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
||||
+ htmlStandard.offset
|
||||
- scriptPenalty
|
||||
- smutCoefficient;
|
||||
@ -52,17 +52,13 @@ public class DocumentValuator {
|
||||
|
||||
double scriptPenalty = 0;
|
||||
for (var tag : scriptTags) {
|
||||
String srcTag = tag.attr("src");
|
||||
if (Strings.isBlank(srcTag)) {
|
||||
scriptPenalty += 1;
|
||||
}
|
||||
else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
|
||||
String srcAttr = tag.attr("src");
|
||||
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
|
||||
scriptPenalty += 0.49;
|
||||
}
|
||||
else {
|
||||
else if (!Strings.isBlank(srcAttr)) {
|
||||
scriptPenalty += 1;
|
||||
}
|
||||
|
||||
}
|
||||
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user