From 41a540a6294d95c398a54d901a531d012de2a9c1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 3 Jan 2024 17:04:38 +0100 Subject: [PATCH] (converter) Penalize chatgpt content farm spam --- .../processor/logic/DocumentValuator.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index a64277c5..af080a3a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -37,17 +37,31 @@ public class DocumentValuator { } private double getChatGptContentFarmPenalty(Document parsedDocument) { - // easily 90% of modern AI-authored content farm spam have this exact string in one of the headings + // easily 90% of modern AI-authored content farm spam has these nonsense headers + boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false; + + outer: for (String tagName : List.of("h1", "h2", "h3")) { for (var elem : parsedDocument.getElementsByTag(tagName)) { - if (elem.text().startsWith("Benefits of")) { - return 10; - } + if (benefitsOf && keyBenefits && keyTakeaways) + break outer; + + String text = elem.text().toLowerCase(); + + benefitsOf = benefitsOf || text.startsWith("benefits of"); + keyBenefits = keyBenefits || text.startsWith("key benefits"); + keyTakeaways = keyTakeaways || text.startsWith("key takeaways"); } } - return 0; + double penalty = 0; + + if (benefitsOf) penalty += 10; + if (keyBenefits) penalty += 5; + if (keyTakeaways) penalty += 5; + + return penalty; }