(converter) Penalize chatgpt content farm spam

This commit is contained in:
Viktor Lofgren 2024-01-03 16:51:26 +01:00
parent 109bec372c
commit e53bb70bef

View File

@ -11,6 +11,7 @@ import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
import java.util.List;
import java.util.Set;
public class DocumentValuator {
@ -21,6 +22,7 @@ public class DocumentValuator {
int textLength) throws DisqualifiedException {
double scriptPenalty = getScriptPenalty(parsedDocument);
double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);
int rawLength = crawledDocument.documentBody.length();
@ -30,7 +32,36 @@ public class DocumentValuator {
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
+ htmlStandard.offset
- scriptPenalty;
- scriptPenalty
- chatGptPenalty;
}
/**
 * Computes a quality penalty for documents whose headings look like
 * AI-authored content farm spam.
 *
 * <p>The text of every {@code h1}, {@code h2} and {@code h3} element is
 * lower-cased and checked against three tell-tale prefixes. Each prefix
 * contributes to the penalty at most once, no matter how many headings
 * match it:
 * <ul>
 *   <li>{@code "benefits of"}: 10</li>
 *   <li>{@code "key benefits"}: 5</li>
 *   <li>{@code "key takeaways"}: 5</li>
 * </ul>
 *
 * @param parsedDocument the parsed HTML document to inspect
 * @return the accumulated penalty, between 0 (no matching heading) and 20
 *         (all three prefixes seen)
 */
private double getChatGptContentFarmPenalty(Document parsedDocument) {
    // easily 90% of modern AI-authored content farm spam has these nonsense headers
    boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;

    outer:
    for (String tagName : List.of("h1", "h2", "h3")) {
        for (var elem : parsedDocument.getElementsByTag(tagName)) {
            // Every flag is already set; further headings cannot change the result.
            if (benefitsOf && keyBenefits && keyTakeaways)
                break outer;

            // Lower-case with Locale.ROOT so the comparison does not depend on the
            // JVM's default locale: under e.g. a Turkish locale "I" lower-cases to a
            // dotless "ı", and an upper-case "BENEFITS OF" heading would slip through.
            String text = elem.text().toLowerCase(java.util.Locale.ROOT);

            benefitsOf = benefitsOf || text.startsWith("benefits of");
            keyBenefits = keyBenefits || text.startsWith("key benefits");
            keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
        }
    }

    // "benefits of" is weighted twice as heavily as either of the other markers.
    double penalty = 0;

    if (benefitsOf) penalty += 10;
    if (keyBenefits) penalty += 5;
    if (keyTakeaways) penalty += 5;

    return penalty;
}