From f5999449426ec143df72f55f460f042abe0d8a58 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 16:51:26 +0100 Subject: [PATCH] (converter) Penalize chatgpt content farm spam --- .../processor/logic/DocumentValuator.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 218f16b8..a64277c5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -11,6 +11,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.select.NodeVisitor; +import java.util.List; import java.util.Set; public class DocumentValuator { @@ -21,6 +22,7 @@ public class DocumentValuator { int textLength) throws DisqualifiedException { double scriptPenalty = getScriptPenalty(parsedDocument); + double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument); int rawLength = crawledDocument.documentBody.length(); @@ -30,7 +32,22 @@ public class DocumentValuator { return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale + htmlStandard.offset - - scriptPenalty; + - scriptPenalty + - chatGptPenalty; + } + + private double getChatGptContentFarmPenalty(Document parsedDocument) { + // easily 90% of modern AI-authored content farm spam has this exact string at the start of one of the headings + + for (String tagName : List.of("h1", "h2", "h3")) { + for (var elem : parsedDocument.getElementsByTag(tagName)) { + if (elem.text().startsWith("Benefits of")) { + return 10; + } + } + } + + return 0; }