(converter) Treat adtech tracking as advertisement.

This commit is contained in:
Viktor Lofgren 2023-08-09 14:23:53 +02:00
parent b5ed21be21
commit ce293029c7
3 changed files with 28 additions and 20 deletions

View File

@ -3,11 +3,15 @@ package nu.marginalia.model.crawl;
import java.util.Collection;
public enum HtmlFeature {
// Note, the first 32 of these features are bit encoded in the database
// so be sure to keep anything that's potentially important toward the top
// of the list
MEDIA( "special:media"),
JS("special:scripts"),
AFFILIATE_LINK( "special:affiliate"),
TRACKING_INNOCENT("special:tracking"),
TRACKING_EVIL("special:tracking2"),
TRACKING("special:tracking"),
TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now
VIEWPORT("special:viewport"),

View File

@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.adblock.AdblockSimulator;
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.topic.RecipeDetector;
@ -25,9 +24,11 @@ public class FeatureExtractor {
"twitter.com",
"bing.com",
"msn.com");
private static final List<String> shittyTrackers = List.of("adform.net",
private static final List<String> adtechTrackers = List.of("adform.net",
"connect.facebook",
"facebook.com/tr",
"absbygoogle.com",
"adnxs.com",
"googletagmanager.com",
"googlesyndication.com",
"smartadserver.com",
@ -203,11 +204,11 @@ public class FeatureExtractor {
for (var scriptTag : scriptTags) {
if (hasInvasiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING_EVIL);
features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
}
else if (hasNaiveTrackingScript(scriptTag)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING);
}
if (scriptTag.hasAttr("didomi/javascript")) {
@ -234,42 +235,44 @@ public class FeatureExtractor {
features.add(HtmlFeature.COOKIELAW);
}
if (scriptText.contains("_linkedin_data_partner_id")) {
features.add(HtmlFeature.TRACKING_EVIL);
features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
}
if (scriptText.contains("window.OneSignal")) {
features.add(HtmlFeature.ONESIGNAL);
}
if (scriptText.contains("connect.facebook.net")) {
features.add(HtmlFeature.TRACKING_EVIL);
features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
}
if (scriptText.contains("hotjar.com")) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING);
}
}
for (var noscript : doc.getElementsByTag("noscript")) {
for (var iframe : noscript.getElementsByTag("iframe")) {
if (hasInvasiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING_EVIL);
features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
}
else if (hasNaiveTrackingScript(iframe)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING);
}
}
for (var img : noscript.getElementsByTag("img")) {
if (hasInvasiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING_EVIL);
features.add(HtmlFeature.TRACKING);
features.add(HtmlFeature.TRACKING_ADTECH);
}
else if (hasNaiveTrackingScript(img)) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING);
}
}
}
if (scriptTags.html().contains("google-analytics.com")) {
features.add(HtmlFeature.TRACKING_INNOCENT);
features.add(HtmlFeature.TRACKING);
}
for (var aTag : doc.getElementsByTag("a")) {
@ -296,7 +299,7 @@ public class FeatureExtractor {
}
private boolean hasInvasiveTrackingScript(String src) {
for (var tracker : shittyTrackers) {
for (var tracker : adtechTrackers) {
if (src.contains(tracker)) {
return true;
}

View File

@ -121,7 +121,8 @@ public class UrlDetails {
for (var problem :EnumSet.of(
HtmlFeature.JS,
HtmlFeature.TRACKING_INNOCENT,
HtmlFeature.TRACKING,
HtmlFeature.TRACKING_ADTECH,
HtmlFeature.AFFILIATE_LINK,
HtmlFeature.COOKIES,
HtmlFeature.ADVERTISEMENT)) {
@ -156,7 +157,7 @@ public class UrlDetails {
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
}
public boolean isTracking() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
}
public boolean isAffiliate() {
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);