mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Treat adtech tracking as advertisement.
This commit is contained in:
parent
b5ed21be21
commit
ce293029c7
@ -3,11 +3,15 @@ package nu.marginalia.model.crawl;
|
|||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
public enum HtmlFeature {
|
public enum HtmlFeature {
|
||||||
|
// Note, the first 32 of these features are bit encoded in the database
|
||||||
|
// so be sure to keep anything that's potentially important toward the top
|
||||||
|
// of the list
|
||||||
|
|
||||||
MEDIA( "special:media"),
|
MEDIA( "special:media"),
|
||||||
JS("special:scripts"),
|
JS("special:scripts"),
|
||||||
AFFILIATE_LINK( "special:affiliate"),
|
AFFILIATE_LINK( "special:affiliate"),
|
||||||
TRACKING_INNOCENT("special:tracking"),
|
TRACKING("special:tracking"),
|
||||||
TRACKING_EVIL("special:tracking2"),
|
TRACKING_ADTECH("special:ads"), // We'll treat this as ads for now
|
||||||
|
|
||||||
VIEWPORT("special:viewport"),
|
VIEWPORT("special:viewport"),
|
||||||
|
|
||||||
|
@ -4,7 +4,6 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.adblock.AdblockSimulator;
|
import nu.marginalia.adblock.AdblockSimulator;
|
||||||
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
import nu.marginalia.adblock.GoogleAnwersSpamDetector;
|
||||||
import nu.marginalia.crawling.model.CrawledDomain;
|
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.topic.RecipeDetector;
|
import nu.marginalia.topic.RecipeDetector;
|
||||||
@ -25,9 +24,11 @@ public class FeatureExtractor {
|
|||||||
"twitter.com",
|
"twitter.com",
|
||||||
"bing.com",
|
"bing.com",
|
||||||
"msn.com");
|
"msn.com");
|
||||||
private static final List<String> shittyTrackers = List.of("adform.net",
|
private static final List<String> adtechTrackers = List.of("adform.net",
|
||||||
"connect.facebook",
|
"connect.facebook",
|
||||||
"facebook.com/tr",
|
"facebook.com/tr",
|
||||||
|
"absbygoogle.com",
|
||||||
|
"adnxs.com",
|
||||||
"googletagmanager.com",
|
"googletagmanager.com",
|
||||||
"googlesyndication.com",
|
"googlesyndication.com",
|
||||||
"smartadserver.com",
|
"smartadserver.com",
|
||||||
@ -203,11 +204,11 @@ public class FeatureExtractor {
|
|||||||
|
|
||||||
for (var scriptTag : scriptTags) {
|
for (var scriptTag : scriptTags) {
|
||||||
if (hasInvasiveTrackingScript(scriptTag)) {
|
if (hasInvasiveTrackingScript(scriptTag)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
features.add(HtmlFeature.TRACKING_EVIL);
|
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||||
}
|
}
|
||||||
else if (hasNaiveTrackingScript(scriptTag)) {
|
else if (hasNaiveTrackingScript(scriptTag)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scriptTag.hasAttr("didomi/javascript")) {
|
if (scriptTag.hasAttr("didomi/javascript")) {
|
||||||
@ -234,42 +235,44 @@ public class FeatureExtractor {
|
|||||||
features.add(HtmlFeature.COOKIELAW);
|
features.add(HtmlFeature.COOKIELAW);
|
||||||
}
|
}
|
||||||
if (scriptText.contains("_linkedin_data_partner_id")) {
|
if (scriptText.contains("_linkedin_data_partner_id")) {
|
||||||
features.add(HtmlFeature.TRACKING_EVIL);
|
features.add(HtmlFeature.TRACKING);
|
||||||
|
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||||
}
|
}
|
||||||
if (scriptText.contains("window.OneSignal")) {
|
if (scriptText.contains("window.OneSignal")) {
|
||||||
features.add(HtmlFeature.ONESIGNAL);
|
features.add(HtmlFeature.ONESIGNAL);
|
||||||
}
|
}
|
||||||
if (scriptText.contains("connect.facebook.net")) {
|
if (scriptText.contains("connect.facebook.net")) {
|
||||||
features.add(HtmlFeature.TRACKING_EVIL);
|
features.add(HtmlFeature.TRACKING);
|
||||||
|
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||||
}
|
}
|
||||||
if (scriptText.contains("hotjar.com")) {
|
if (scriptText.contains("hotjar.com")) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var noscript : doc.getElementsByTag("noscript")) {
|
for (var noscript : doc.getElementsByTag("noscript")) {
|
||||||
for (var iframe : noscript.getElementsByTag("iframe")) {
|
for (var iframe : noscript.getElementsByTag("iframe")) {
|
||||||
if (hasInvasiveTrackingScript(iframe)) {
|
if (hasInvasiveTrackingScript(iframe)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
features.add(HtmlFeature.TRACKING_EVIL);
|
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||||
}
|
}
|
||||||
else if (hasNaiveTrackingScript(iframe)) {
|
else if (hasNaiveTrackingScript(iframe)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (var img : noscript.getElementsByTag("img")) {
|
for (var img : noscript.getElementsByTag("img")) {
|
||||||
if (hasInvasiveTrackingScript(img)) {
|
if (hasInvasiveTrackingScript(img)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
features.add(HtmlFeature.TRACKING_EVIL);
|
features.add(HtmlFeature.TRACKING_ADTECH);
|
||||||
}
|
}
|
||||||
else if (hasNaiveTrackingScript(img)) {
|
else if (hasNaiveTrackingScript(img)) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (scriptTags.html().contains("google-analytics.com")) {
|
if (scriptTags.html().contains("google-analytics.com")) {
|
||||||
features.add(HtmlFeature.TRACKING_INNOCENT);
|
features.add(HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var aTag : doc.getElementsByTag("a")) {
|
for (var aTag : doc.getElementsByTag("a")) {
|
||||||
@ -296,7 +299,7 @@ public class FeatureExtractor {
|
|||||||
}
|
}
|
||||||
private boolean hasInvasiveTrackingScript(String src) {
|
private boolean hasInvasiveTrackingScript(String src) {
|
||||||
|
|
||||||
for (var tracker : shittyTrackers) {
|
for (var tracker : adtechTrackers) {
|
||||||
if (src.contains(tracker)) {
|
if (src.contains(tracker)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -121,7 +121,8 @@ public class UrlDetails {
|
|||||||
|
|
||||||
for (var problem :EnumSet.of(
|
for (var problem :EnumSet.of(
|
||||||
HtmlFeature.JS,
|
HtmlFeature.JS,
|
||||||
HtmlFeature.TRACKING_INNOCENT,
|
HtmlFeature.TRACKING,
|
||||||
|
HtmlFeature.TRACKING_ADTECH,
|
||||||
HtmlFeature.AFFILIATE_LINK,
|
HtmlFeature.AFFILIATE_LINK,
|
||||||
HtmlFeature.COOKIES,
|
HtmlFeature.COOKIES,
|
||||||
HtmlFeature.ADVERTISEMENT)) {
|
HtmlFeature.ADVERTISEMENT)) {
|
||||||
@ -156,7 +157,7 @@ public class UrlDetails {
|
|||||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||||
}
|
}
|
||||||
public boolean isTracking() {
|
public boolean isTracking() {
|
||||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_INNOCENT);
|
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
||||||
}
|
}
|
||||||
public boolean isAffiliate() {
|
public boolean isAffiliate() {
|
||||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||||
|
Loading…
Reference in New Issue
Block a user