Block websites with "acceptable ads", as this seems to be a strong indicator that the domain is either parked or spam.

vlofgren 2022-07-08 16:50:00 +02:00
parent 7a4f5c27a6
commit 2b83e0d754
3 changed files with 41 additions and 9 deletions

DisqualifiedException.java

@@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception {
     }

     public enum DisqualificationReason {
-        LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY
+        LENGTH,
+        CONTENT_TYPE,
+        LANGUAGE,
+        STATUS,
+        QUALITY,
+        ACCEPTABLE_ADS
     }
 }

AcceptableAds.java (new file)

@@ -0,0 +1,22 @@
+package nu.marginalia.wmsa.edge.converting.processor;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import org.jsoup.nodes.Document;
+
+public class AcceptableAds {
+    /* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers.
+     *
+     * In practice, from looking at crawled data, the only sites in the crawled corpus that seem to
+     * follow this standard are domain squatters and other nuisance sites.
+     *
+     */
+
+    public static boolean hasAcceptableAdsTag(Document parsedDocument) {
+        return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey");
+    }
+
+    public static boolean hasAcceptableAdsHeader(CrawledDocument document) {
+        return document.headers.contains("X-Adblock-Key");
+    }
+}
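
The two checks are easy to exercise in isolation. The snippet below is not part of the commit: it runs the same Jsoup calls used above against hand-made input, with a plain String standing in for the CrawledDocument headers field, to show what the detection actually keys on.

// Illustrative only: mirrors the AcceptableAds checks against hand-made input.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class AcceptableAdsDemo {
    public static void main(String[] args) {
        // Markup typical of parked domains participating in Acceptable Ads:
        Document parked = Jsoup.parse("<html data-adblockkey=\"dummy-key\"><body>This domain is for sale</body></html>");
        Document normal = Jsoup.parse("<html><body>An ordinary page</body></html>");

        System.out.println(parked.getElementsByTag("html").hasAttr("data-adblockkey")); // true
        System.out.println(normal.getElementsByTag("html").hasAttr("data-adblockkey")); // false

        // Header variant: assuming the crawler exposes the response headers as one
        // text blob, a plain substring check on the X-Adblock-Key header suffices.
        String headers = "Content-Type: text/html\nX-Adblock-Key: dummy-key";
        System.out.println(headers.contains("X-Adblock-Key")); // true
    }
}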

DocumentProcessor.java

@@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.hash.HashCode;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
@@ -81,6 +77,10 @@ public class DocumentProcessor {
         if (ret.state == EdgeUrlState.OK) {
+            if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
+                throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+            }
+
             if (isAcceptedContentType(crawledDocument)) {
                 var detailsWords = createDetails(crawledDomain, crawledDocument);
@@ -128,6 +128,11 @@ public class DocumentProcessor {
             throws DisqualifiedException, URISyntaxException {
         var doc = Jsoup.parse(crawledDocument.documentBody);
+
+        if (AcceptableAds.hasAcceptableAdsTag(doc)) {
+            throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+        }
+
         var dld = sentenceExtractor.extractSentences(doc.clone());
         checkDocumentLanguage(dld);
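
Taken together, the three diffs make the processor bail out before any expensive sentence or keyword extraction when either acceptable-ads marker is present. The sketch below compresses that control flow into one self-contained method; the class, method, and parameter names are placeholders and do not reflect the real DocumentProcessor signatures.

// Condensed illustration of the disqualification flow; process(), the bare String
// parameters, and the nested types are placeholders, not the real pipeline API.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class AcceptableAdsFlowSketch {

    enum DisqualificationReason { LENGTH, CONTENT_TYPE, LANGUAGE, STATUS, QUALITY, ACCEPTABLE_ADS }

    static class DisqualifiedException extends Exception {
        final DisqualificationReason reason;
        DisqualifiedException(DisqualificationReason reason) { this.reason = reason; }
    }

    // Cheap header check first, then the DOM check once the body has been parsed.
    static void process(String headers, String body) {
        try {
            if (headers.contains("X-Adblock-Key"))
                throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);

            Document doc = Jsoup.parse(body);
            if (doc.getElementsByTag("html").hasAttr("data-adblockkey"))
                throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);

            // ... sentence extraction, language checks and keyword extraction would follow here ...
        }
        catch (DisqualifiedException e) {
            // The real processor records the reason and skips indexing the document.
            System.out.println("disqualified: " + e.reason);
        }
    }

    public static void main(String[] args) {
        process("X-Adblock-Key: dummy-key", "<html><body>This domain is for sale</body></html>");
        process("Content-Type: text/html", "<html><body>An ordinary page</body></html>");
    }
}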