From d2fdaafc7a4a51b175642598e6475fc57253b07b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 30 Jun 2023 17:10:25 +0200 Subject: [PATCH] Big brain web developers were using onload and onerror handlers to load JS without script tags... --- .../processor/logic/DocumentValuator.java | 13 ++++++++---- .../processor/logic/FeatureExtractor.java | 20 ++++++++++++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 59b208ec..9de7af57 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -36,8 +36,15 @@ public class DocumentValuator { var scriptVisitor = new ScriptVisitor(); parsed.getElementsByTag("script").traverse(scriptVisitor); + int value = scriptVisitor.score(); - return scriptVisitor.score(); + for (var links : parsed.head().getElementsByTag("link")) { + if (links.hasAttr("onerror") || links.hasAttr("onload")) { + value += 1; + } + } + + return value; } private static class ScriptVisitor implements NodeVisitor { @@ -56,7 +63,6 @@ public class DocumentValuator { } else if (node instanceof TextNode tn) { visitScriptText(tn); - } } @@ -73,8 +79,7 @@ public class DocumentValuator { String srcAttr = el.attr("src"); if (srcAttr.contains("wp-content") || srcAttr.contains("wp-includes") || srcAttr.contains("jquery")) { penalty += 0.49; - } - else if (!Strings.isBlank(srcAttr)) { + } else if (!Strings.isBlank(srcAttr)) { penalty += 1; } } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index fea500d3..2ea690f1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -76,6 +76,19 @@ public class FeatureExtractor { } } + // 500 IQ web developers use error or load handlers + // to sneakily load JS without explicit script tags + for (var link : doc.head().getElementsByTag("link")) { + if (link.hasAttr("onerror")) { + features.add(HtmlFeature.JS); + break; + } + if (link.hasAttr("onload")) { + features.add(HtmlFeature.JS); + break; + } + } + if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) { features.add(HtmlFeature.ADVERTISEMENT); } @@ -117,8 +130,13 @@ public class FeatureExtractor { } private boolean hasTrackingScript(Element scriptTag) { + return hasTrackingScript(scriptTag.attr("src")); + } + + private boolean hasTrackingScript(String scriptText) { + for (var tracker : trackers) { - if (scriptTag.attr("src").contains(tracker)) { + if (scriptText.contains(tracker)) { return true; } }