mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Block websites with "acceptable ads", as this seems a strong indicator the domain is either parked or spam.
This commit is contained in:
parent
7a4f5c27a6
commit
2b83e0d754
@ -12,6 +12,11 @@ public class DisqualifiedException extends Exception {
|
||||
}
|
||||
|
||||
/**
 * Reason codes identifying why a document was disqualified.
 *
 * <p>A value of this enum is passed to the {@code DisqualifiedException}
 * constructor at the point where the document is rejected.
 *
 * <p>New constants are appended at the end so that the relative order of the
 * pre-existing constants is unchanged.
 */
public enum DisqualificationReason {
    LENGTH,
    CONTENT_TYPE,
    LANGUAGE,
    STATUS,
    QUALITY,
    /** The document carries an "Acceptable Ads" marker (HTTP header or html-tag attribute). */
    ACCEPTABLE_ADS
}
|
||||
}
|
||||
|
@ -0,0 +1,22 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor;
|
||||
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
|
||||
public class AcceptableAds {
|
||||
/* Acceptable Ads is an initiative to allow less intrusive ads to punch through adblockers.
|
||||
*
|
||||
* In practice, from looking at crawled data, the only sites in the crawled corpus that seem to
|
||||
* follow this standard are domain squatters and other nuisance sites.
|
||||
*
|
||||
*/
|
||||
|
||||
public static boolean hasAcceptableAdsTag(Document parsedDocument) {
|
||||
return parsedDocument.getElementsByTag("html").hasAttr("data-adblockkey");
|
||||
}
|
||||
|
||||
public static boolean hasAcceptableAdsHeader(CrawledDocument document) {
|
||||
return document.headers.contains("X-Adblock-Key");
|
||||
}
|
||||
}
|
@ -3,19 +3,15 @@ package nu.marginalia.wmsa.edge.converting.processor;
|
||||
import com.google.common.hash.HashCode;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocumentDetails;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.*;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.FeedExtractor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlStandardExtractor;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
@ -81,6 +77,10 @@ public class DocumentProcessor {
|
||||
|
||||
if (ret.state == EdgeUrlState.OK) {
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (isAcceptedContentType(crawledDocument)) {
|
||||
var detailsWords = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
@ -128,6 +128,11 @@ public class DocumentProcessor {
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
var doc = Jsoup.parse(crawledDocument.documentBody);
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
var dld = sentenceExtractor.extractSentences(doc.clone());
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
Loading…
Reference in New Issue
Block a user