mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Make processor more lenient toward quality, accept content-types which specify charset
This commit is contained in:
parent
e9a270c015
commit
20970a6161
@ -11,7 +11,6 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
|||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
public class ConverterModule extends AbstractModule {
|
public class ConverterModule extends AbstractModule {
|
||||||
|
|
||||||
@ -27,6 +26,7 @@ public class ConverterModule extends AbstractModule {
|
|||||||
bind(Gson.class).toInstance(createGson());
|
bind(Gson.class).toInstance(createGson());
|
||||||
|
|
||||||
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
|
bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
|
||||||
|
bind(Double.class).annotatedWith(Names.named("min-avg-document-quality")).toInstance(-25.);
|
||||||
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
|
bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(100);
|
||||||
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
|
||||||
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
|
||||||
|
@ -113,7 +113,19 @@ public class DocumentProcessor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
|
private boolean isAcceptedContentType(CrawledDocument crawledDocument) {
|
||||||
return crawledDocument.contentType != null && acceptedContentTypes.contains(crawledDocument.contentType.toLowerCase());
|
if (crawledDocument.contentType == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var ct = crawledDocument.contentType;
|
||||||
|
|
||||||
|
if (acceptedContentTypes.contains(ct))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (ct.contains(";")) {
|
||||||
|
return acceptedContentTypes.contains(ct.substring(0, ct.indexOf(';')));
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
|
private EdgeUrlState crawlerStatusToUrlState(String crawlerStatus, int httpStatus) {
|
||||||
|
@ -1,21 +1,29 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor;
|
package nu.marginalia.wmsa.edge.converting.processor;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
public class DomainProcessor {
|
public class DomainProcessor {
|
||||||
private final DocumentProcessor documentProcessor;
|
private final DocumentProcessor documentProcessor;
|
||||||
|
private final Double minAvgDocumentQuality;
|
||||||
|
|
||||||
/**
 * Creates a domain processor.
 *
 * @param documentProcessor     processor stored for use on the domain's documents
 * @param minAvgDocumentQuality value injected under the name "min-avg-document-quality";
 *                              stored as the threshold the domain's average quality is
 *                              compared against
 */
@Inject
public DomainProcessor(
        DocumentProcessor documentProcessor,
        @Named("min-avg-document-quality") Double minAvgDocumentQuality) {
    this.minAvgDocumentQuality = minAvgDocumentQuality;
    this.documentProcessor = documentProcessor;
}
|
||||||
|
|
||||||
public ProcessedDomain process(CrawledDomain crawledDomain) {
|
public ProcessedDomain process(CrawledDomain crawledDomain) {
|
||||||
@ -37,17 +45,37 @@ public class DomainProcessor {
|
|||||||
ret.documents.add(processedDoc);
|
ret.documents.add(processedDoc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ret.documents = Collections.emptyList();
|
ret.documents = Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double averageQuality = getAverageQuality(ret.documents);
|
||||||
|
if (averageQuality < minAvgDocumentQuality) {
|
||||||
|
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
|
||||||
|
}
|
||||||
|
|
||||||
ret.state = getState(crawledDomain.crawlerStatus);
|
ret.state = getState(crawledDomain.crawlerStatus);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private double getAverageQuality(List<ProcessedDocument> documents) {
|
||||||
|
int n = 0;
|
||||||
|
double q = 0.;
|
||||||
|
for (var doc : documents) {
|
||||||
|
if (doc.quality().isPresent()) {
|
||||||
|
n++;
|
||||||
|
q += doc.quality().getAsDouble();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n > 0) {
|
||||||
|
return q / n;
|
||||||
|
}
|
||||||
|
return -5.;
|
||||||
|
}
|
||||||
|
|
||||||
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
||||||
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
||||||
case OK -> EdgeDomainIndexingState.ACTIVE;
|
case OK -> EdgeDomainIndexingState.ACTIVE;
|
||||||
|
@ -42,15 +42,16 @@ public class InstructionsCompiler {
|
|||||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||||
|
|
||||||
documents.stream().map(doc -> doc.url).forEach(seenUrls::add);
|
|
||||||
|
|
||||||
for (var doc : documents) {
|
for (var doc : documents) {
|
||||||
if (doc.details == null) continue;
|
seenUrls.add(doc.url);
|
||||||
for (var url : doc.details.linksExternal) {
|
|
||||||
seenDomains.add(url.domain);
|
if (doc.details != null) {
|
||||||
|
for (var url : doc.details.linksExternal) {
|
||||||
|
seenDomains.add(url.domain);
|
||||||
|
}
|
||||||
|
seenUrls.addAll(doc.details.linksExternal);
|
||||||
|
seenUrls.addAll(doc.details.linksInternal);
|
||||||
}
|
}
|
||||||
seenUrls.addAll(doc.details.linksExternal);
|
|
||||||
seenUrls.addAll(doc.details.linksInternal);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||||
|
|
||||||
import crawlercommons.utils.Strings;
|
import crawlercommons.utils.Strings;
|
||||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ public class DocumentValuator {
|
|||||||
throw new DisqualifiedException(LENGTH);
|
throw new DisqualifiedException(LENGTH);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Math.log(textBodyLength / (double) rawLength)*htmlStandard.scale
|
return Math.log(textBodyLength / (double) (1+rawLength))*htmlStandard.scale
|
||||||
+ htmlStandard.offset
|
+ htmlStandard.offset
|
||||||
- scriptPenalty
|
- scriptPenalty
|
||||||
- smutCoefficient;
|
- smutCoefficient;
|
||||||
@ -52,17 +52,13 @@ public class DocumentValuator {
|
|||||||
|
|
||||||
double scriptPenalty = 0;
|
double scriptPenalty = 0;
|
||||||
for (var tag : scriptTags) {
|
for (var tag : scriptTags) {
|
||||||
String srcTag = tag.attr("src");
|
String srcAttr = tag.attr("src");
|
||||||
if (Strings.isBlank(srcTag)) {
|
if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) {
|
||||||
scriptPenalty += 1;
|
|
||||||
}
|
|
||||||
else if (srcTag.contains("wp-content") || srcTag.contains("wp-includes") || srcTag.contains("jquery")) {
|
|
||||||
scriptPenalty += 0.49;
|
scriptPenalty += 0.49;
|
||||||
}
|
}
|
||||||
else {
|
else if (!Strings.isBlank(srcAttr)) {
|
||||||
scriptPenalty += 1;
|
scriptPenalty += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
|
return (int)(scriptPenalty + badScript + (scriptText.length())/1000.);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user