(converter) Penalize chatgpt content farm spam

2025-02-23 13:09:00 +00:00 · 2024-01-03 16:51:26 +01:00 · 2024-01-03 16:51:26 +01:00 · e53bb70bef
commit e53bb70bef
parent 109bec372c
1 changed files with 32 additions and 1 deletions
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java
@ -11,6 +11,7 @@ import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.NodeVisitor;

+import java.util.List;
 import java.util.Set;

 public class DocumentValuator {
@ -21,6 +22,7 @@ public class DocumentValuator {
                             int textLength) throws DisqualifiedException {

        double scriptPenalty = getScriptPenalty(parsedDocument);
+        double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);

        int rawLength = crawledDocument.documentBody.length();

@ -30,7 +32,36 @@ public class DocumentValuator {

        return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
                + htmlStandard.offset
-                - scriptPenalty;
+                - scriptPenalty
+                - chatGptPenalty;
+    }
+
+    /**
+     * Computes a ranking penalty for documents whose headings look like
+     * AI-authored content farm spam.
+     * <p>
+     * The text of each h1, h2 and h3 element is lower-cased and checked for
+     * three tell-tale prefixes. Each prefix counts at most once, no matter
+     * how many headings start with it:
+     * <ul>
+     *   <li>"benefits of" adds 10</li>
+     *   <li>"key benefits" adds 5</li>
+     *   <li>"key takeaways" adds 5</li>
+     * </ul>
+     *
+     * @param parsedDocument the parsed document whose headings are inspected
+     * @return the accumulated penalty, from 0 (no marker found) to 20 (all three found)
+     */
+    private double getChatGptContentFarmPenalty(Document parsedDocument) {
+        // easily 90% of modern AI-authored content farm spam has these nonsense headers
+
+        boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;
+
+        outer:
+        for (String tagName : List.of("h1", "h2", "h3")) {
+            for (var elem : parsedDocument.getElementsByTag(tagName)) {
+                // Lower-case with a fixed locale: under e.g. a Turkish default locale
+                // "I" maps to a dotless "ı", so an upper-case "KEY BENEFITS" heading
+                // would never match the ASCII prefixes below.
+                String text = elem.text().toLowerCase(java.util.Locale.ROOT);
+
+                benefitsOf = benefitsOf || text.startsWith("benefits of");
+                keyBenefits = keyBenefits || text.startsWith("key benefits");
+                keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
+
+                // Once every marker has been seen, no further heading (or tag
+                // lookup) can change the result, so stop scanning right away.
+                if (benefitsOf && keyBenefits && keyTakeaways)
+                    break outer;
+            }
+        }
+
+        double penalty = 0;
+
+        if (benefitsOf) penalty += 10;
+        if (keyBenefits) penalty += 5;
+        if (keyTakeaways) penalty += 5;
+
+        return penalty;
    }