diff --git a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java index d9adbff6..03e5557c 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java +++ b/code/common/model/src/main/java/nu/marginalia/model/crawl/HtmlFeature.java @@ -3,11 +3,15 @@ package nu.marginalia.model.crawl; import java.util.Collection; public enum HtmlFeature { + // Note, the first 32 of these features are bit encoded in the database + // so be sure to keep anything that's potentially important toward the top + // of the list + MEDIA( "special:media"), JS("special:scripts"), AFFILIATE_LINK( "special:affiliate"), - TRACKING_INNOCENT("special:tracking"), - TRACKING_EVIL("special:tracking2"), + TRACKING("special:tracking"), + TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now VIEWPORT("special:viewport"), diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index c431e94b..040f96dd 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.adblock.GoogleAnwersSpamDetector; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.topic.RecipeDetector; @@ -25,9 +24,11 @@ public class FeatureExtractor { "twitter.com", "bing.com", "msn.com"); - private static final List shittyTrackers = List.of("adform.net", + private static final 
List adtechTrackers = List.of("adform.net", "connect.facebook", "facebook.com/tr", + "absbygoogle.com", /* NOTE(review): possibly a typo for adsbygoogle -- confirm before merging */ + "adnxs.com", "googletagmanager.com", "googlesyndication.com", "smartadserver.com", @@ -203,11 +204,11 @@ public class FeatureExtractor { for (var scriptTag : scriptTags) { if (hasInvasiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(scriptTag)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } if (scriptTag.hasAttr("didomi/javascript")) { @@ -234,42 +235,44 @@ public class FeatureExtractor { features.add(HtmlFeature.COOKIELAW); } if (scriptText.contains("_linkedin_data_partner_id")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("window.OneSignal")) { features.add(HtmlFeature.ONESIGNAL); } if (scriptText.contains("connect.facebook.net")) { - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } if (scriptText.contains("hotjar.com")) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var noscript : doc.getElementsByTag("noscript")) { for (var iframe : noscript.getElementsByTag("iframe")) { if (hasInvasiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(iframe)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } for (var img : noscript.getElementsByTag("img")) { if (hasInvasiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); - features.add(HtmlFeature.TRACKING_EVIL); + 
features.add(HtmlFeature.TRACKING); + features.add(HtmlFeature.TRACKING_ADTECH); } else if (hasNaiveTrackingScript(img)) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } } } if (scriptTags.html().contains("google-analytics.com")) { - features.add(HtmlFeature.TRACKING_INNOCENT); + features.add(HtmlFeature.TRACKING); } for (var aTag : doc.getElementsByTag("a")) { @@ -296,7 +299,7 @@ public class FeatureExtractor { } private boolean hasInvasiveTrackingScript(String src) { - for (var tracker : shittyTrackers) { + for (var tracker : adtechTrackers) { if (src.contains(tracker)) { return true; } diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java index b44f2551..2de12536 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/model/UrlDetails.java @@ -121,7 +121,8 @@ public class UrlDetails { for (var problem :EnumSet.of( HtmlFeature.JS, - HtmlFeature.TRACKING_INNOCENT, + HtmlFeature.TRACKING, + HtmlFeature.TRACKING_ADTECH, HtmlFeature.AFFILIATE_LINK, HtmlFeature.COOKIES, HtmlFeature.ADVERTISEMENT)) { @@ -156,7 +157,7 @@ public class UrlDetails { return HtmlFeature.hasFeature(features, HtmlFeature.JS); } public boolean isTracking() { - return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT); + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); } public boolean isAffiliate() { return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);