(converter) Treat adtech tracking as advertisement.

This commit is contained in:
Viktor Lofgren 2023-08-09 14:23:53 +02:00
parent b5ed21be21
commit ce293029c7
3 changed files with 28 additions and 20 deletions

View File

@ -3,11 +3,15 @@ package nu.marginalia.model.crawl;
import java.util.Collection; import java.util.Collection;
public enum HtmlFeature { public enum HtmlFeature {
// Note, the first 32 of these features are bit encoded in the database
// so be sure to keep anything that's potentially important toward the top
// of the list
MEDIA( "special:media"), MEDIA( "special:media"),
JS("special:scripts"), JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"), AFFILIATE_LINK( "special:affiliate"),
TRACKING_INNOCENT("special:tracking"), TRACKING("special:tracking"),
TRACKING_EVIL("special:tracking2"), TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now
VIEWPORT("special:viewport"), VIEWPORT("special:viewport"),

View File

@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector; import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.topic.RecipeDetector; import nu.marginalia.topic.RecipeDetector;
@ -25,9 +24,11 @@ public class FeatureExtractor {
"twitter.com", "twitter.com",
"bing.com", "bing.com",
"msn.com"); "msn.com");
private static final List<String> shittyTrackers = List.of("adform.net", private static final List<String> adtechTrackers = List.of("adform.net",
"connect.facebook", "connect.facebook",
"facebook.com/tr", "facebook.com/tr",
"absbygoogle.com",
"adnxs.com",
"googletagmanager.com", "googletagmanager.com",
"googlesyndication.com", "googlesyndication.com",
"smartadserver.com", "smartadserver.com",
@ -203,11 +204,11 @@ public class FeatureExtractor {
for (var scriptTag : scriptTags) { for (var scriptTag : scriptTags) {
if (hasInvasiveTrackingScript(scriptTag)) { if (hasInvasiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_EVIL); features.add(HtmlFeature.TRACKING_ADTECH);
} }
else if (hasNaiveTrackingScript(scriptTag)) { else if (hasNaiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
} }
if (scriptTag.hasAttr("didomi/javascript")) { if (scriptTag.hasAttr("didomi/javascript")) {
@ -234,42 +235,44 @@ public class FeatureExtractor {
features.add(HtmlFeature.COOKIELAW); features.add(HtmlFeature.COOKIELAW);
} }
if (scriptText.contains("_linkedin_data_partner_id")) { if (scriptText.contains("_linkedin_data_partner_id")) {
features.add(HtmlFeature.TRACKING_EVIL); features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
} }
if (scriptText.contains("window.OneSignal")) { if (scriptText.contains("window.OneSignal")) {
features.add(HtmlFeature.ONESIGNAL); features.add(HtmlFeature.ONESIGNAL);
} }
if (scriptText.contains("connect.facebook.net")) { if (scriptText.contains("connect.facebook.net")) {
features.add(HtmlFeature.TRACKING_EVIL); features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
} }
if (scriptText.contains("hotjar.com")) { if (scriptText.contains("hotjar.com")) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
} }
} }
for (var noscript : doc.getElementsByTag("noscript")) { for (var noscript : doc.getElementsByTag("noscript")) {
for (var iframe : noscript.getElementsByTag("iframe")) { for (var iframe : noscript.getElementsByTag("iframe")) {
if (hasInvasiveTrackingScript(iframe)) { if (hasInvasiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_EVIL); features.add(HtmlFeature.TRACKING_ADTECH);
} }
else if (hasNaiveTrackingScript(iframe)) { else if (hasNaiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
} }
} }
for (var img : noscript.getElementsByTag("img")) { for (var img : noscript.getElementsByTag("img")) {
if (hasInvasiveTrackingScript(img)) { if (hasInvasiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_EVIL); features.add(HtmlFeature.TRACKING_ADTECH);
} }
else if (hasNaiveTrackingScript(img)) { else if (hasNaiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
} }
} }
} }
if (scriptTags.html().contains("google-analytics.com")) { if (scriptTags.html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING_INNOCENT); features.add(HtmlFeature.TRACKING);
} }
for (var aTag : doc.getElementsByTag("a")) { for (var aTag : doc.getElementsByTag("a")) {
@ -296,7 +299,7 @@ public class FeatureExtractor {
} }
private boolean hasInvasiveTrackingScript(String src) { private boolean hasInvasiveTrackingScript(String src) {
for (var tracker : shittyTrackers) { for (var tracker : adtechTrackers) {
if (src.contains(tracker)) { if (src.contains(tracker)) {
return true; return true;
} }

View File

@ -121,7 +121,8 @@ public class UrlDetails {
for (var problem :EnumSet.of( for (var problem :EnumSet.of(
HtmlFeature.JS, HtmlFeature.JS,
HtmlFeature.TRACKING_INNOCENT, HtmlFeature.TRACKING,
HtmlFeature.TRACKING_ADTECH,
HtmlFeature.AFFILIATE_LINK, HtmlFeature.AFFILIATE_LINK,
HtmlFeature.COOKIES, HtmlFeature.COOKIES,
HtmlFeature.ADVERTISEMENT)) { HtmlFeature.ADVERTISEMENT)) {
@ -156,7 +157,7 @@ public class UrlDetails {
return HtmlFeature.hasFeature(features, HtmlFeature.JS); return HtmlFeature.hasFeature(features, HtmlFeature.JS);
} }
public boolean isTracking() { public boolean isTracking() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
} }
public boolean isAffiliate() { public boolean isAffiliate() {
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);