mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Treat adtech tracking as advertisement.
This commit is contained in:
parent
b5ed21be21
commit
ce293029c7
@ -3,11 +3,15 @@ package nu.marginalia.model.crawl;
|
||||
import java.util.Collection;
|
||||
|
||||
public enum HtmlFeature {
|
||||
// Note, the first 32 of these features are bit encoded in the database
|
||||
// so be sure to keep anything that's potentially important toward the top
|
||||
// of the list
|
||||
|
||||
MEDIA( "special:media"),
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING_INNOCENT("special:tracking"),
|
||||
TRACKING_EVIL("special:tracking2"),
|
||||
TRACKING("special:tracking"),
|
||||
TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now
|
||||
|
||||
VIEWPORT("special:viewport"),
|
||||
|
||||
|
@ -4,7 +4,6 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.adblock.AdblockSimulator;
|
||||
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||
import nu.marginalia.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.topic.RecipeDetector;
|
||||
@ -25,9 +24,11 @@ public class FeatureExtractor {
|
||||
"twitter.com",
|
||||
"bing.com",
|
||||
"msn.com");
|
||||
private static final List<String> shittyTrackers = List.of("adform.net",
|
||||
private static final List<String> adtechTrackers = List.of("adform.net",
|
||||
"connect.facebook",
|
||||
"facebook.com/tr",
|
||||
"absbygoogle.com",
|
||||
"adnxs.com",
|
||||
"googletagmanager.com",
|
||||
"googlesyndication.com",
|
||||
"smartadserver.com",
|
||||
@ -203,11 +204,11 @@ public class FeatureExtractor {
|
||||
|
||||
for (var scriptTag : scriptTags) {
|
||||
if (hasInvasiveTrackingScript(scriptTag)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(scriptTag)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
|
||||
if (scriptTag.hasAttr("didomi/javascript")) {
|
||||
@ -234,42 +235,44 @@ public class FeatureExtractor {
|
||||
features.add(HtmlFeature.COOKIELAW);
|
||||
}
|
||||
if (scriptText.contains("_linkedin_data_partner_id")) {
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
if (scriptText.contains("window.OneSignal")) {
|
||||
features.add(HtmlFeature.ONESIGNAL);
|
||||
}
|
||||
if (scriptText.contains("connect.facebook.net")) {
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
if (scriptText.contains("hotjar.com")) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
}
|
||||
|
||||
for (var noscript : doc.getElementsByTag("noscript")) {
|
||||
for (var iframe : noscript.getElementsByTag("iframe")) {
|
||||
if (hasInvasiveTrackingScript(iframe)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(iframe)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
}
|
||||
for (var img : noscript.getElementsByTag("img")) {
|
||||
if (hasInvasiveTrackingScript(img)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING_EVIL);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
else if (hasNaiveTrackingScript(img)) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scriptTags.html().contains("google-analytics.com")) {
|
||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
||||
features.add(HtmlFeature.TRACKING);
|
||||
}
|
||||
|
||||
for (var aTag : doc.getElementsByTag("a")) {
|
||||
@ -296,7 +299,7 @@ public class FeatureExtractor {
|
||||
}
|
||||
private boolean hasInvasiveTrackingScript(String src) {
|
||||
|
||||
for (var tracker : shittyTrackers) {
|
||||
for (var tracker : adtechTrackers) {
|
||||
if (src.contains(tracker)) {
|
||||
return true;
|
||||
}
|
||||
|
@ -121,7 +121,8 @@ public class UrlDetails {
|
||||
|
||||
for (var problem :EnumSet.of(
|
||||
HtmlFeature.JS,
|
||||
HtmlFeature.TRACKING_INNOCENT,
|
||||
HtmlFeature.TRACKING,
|
||||
HtmlFeature.TRACKING_ADTECH,
|
||||
HtmlFeature.AFFILIATE_LINK,
|
||||
HtmlFeature.COOKIES,
|
||||
HtmlFeature.ADVERTISEMENT)) {
|
||||
@ -156,7 +157,7 @@ public class UrlDetails {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||
}
|
||||
public boolean isTracking() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
||||
}
|
||||
public boolean isAffiliate() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||
|
Loading…
Reference in New Issue
Block a user