mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Penalize ChatGPT content farm spam
This commit is contained in:
parent
f599944942
commit
41a540a629
@ -37,17 +37,31 @@ public class DocumentValuator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private double getChatGptContentFarmPenalty(Document parsedDocument) {
|
private double getChatGptContentFarmPenalty(Document parsedDocument) {
|
||||||
// easily 90% of modern AI-authored content farm spam have this exact string in one of the headings
|
// easily 90% of modern AI-authored content farm spam has these nonsense headers
|
||||||
|
|
||||||
|
boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;
|
||||||
|
|
||||||
|
outer:
|
||||||
for (String tagName : List.of("h1", "h2", "h3")) {
|
for (String tagName : List.of("h1", "h2", "h3")) {
|
||||||
for (var elem : parsedDocument.getElementsByTag(tagName)) {
|
for (var elem : parsedDocument.getElementsByTag(tagName)) {
|
||||||
if (elem.text().startsWith("Benefits of")) {
|
if (benefitsOf && keyBenefits && keyTakeaways)
|
||||||
return 10;
|
break outer;
|
||||||
}
|
|
||||||
|
String text = elem.text().toLowerCase();
|
||||||
|
|
||||||
|
benefitsOf = benefitsOf || text.startsWith("benefits of");
|
||||||
|
keyBenefits = keyBenefits || text.startsWith("key benefits");
|
||||||
|
keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
double penalty = 0;
|
||||||
|
|
||||||
|
if (benefitsOf) penalty += 10;
|
||||||
|
if (keyBenefits) penalty += 5;
|
||||||
|
if (keyTakeaways) penalty += 5;
|
||||||
|
|
||||||
|
return penalty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user