mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(converter) Penalize chatgpt content farm spam
This commit is contained in:
parent
109bec372c
commit
e53bb70bef
@ -11,6 +11,7 @@ import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Node;
|
||||
import org.jsoup.select.NodeVisitor;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class DocumentValuator {
|
||||
@ -21,6 +22,7 @@ public class DocumentValuator {
|
||||
int textLength) throws DisqualifiedException {
|
||||
|
||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||
double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);
|
||||
|
||||
int rawLength = crawledDocument.documentBody.length();
|
||||
|
||||
@ -30,7 +32,36 @@ public class DocumentValuator {
|
||||
|
||||
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
|
||||
+ htmlStandard.offset
|
||||
- scriptPenalty;
|
||||
- scriptPenalty
|
||||
- chatGptPenalty;
|
||||
}
|
||||
|
||||
private double getChatGptContentFarmPenalty(Document parsedDocument) {
|
||||
// easily 90% of modern AI-authored content farm spam has these nonsense headers
|
||||
|
||||
boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;
|
||||
|
||||
outer:
|
||||
for (String tagName : List.of("h1", "h2", "h3")) {
|
||||
for (var elem : parsedDocument.getElementsByTag(tagName)) {
|
||||
if (benefitsOf && keyBenefits && keyTakeaways)
|
||||
break outer;
|
||||
|
||||
String text = elem.text().toLowerCase();
|
||||
|
||||
benefitsOf = benefitsOf || text.startsWith("benefits of");
|
||||
keyBenefits = keyBenefits || text.startsWith("key benefits");
|
||||
keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
|
||||
}
|
||||
}
|
||||
|
||||
double penalty = 0;
|
||||
|
||||
if (benefitsOf) penalty += 10;
|
||||
if (keyBenefits) penalty += 5;
|
||||
if (keyTakeaways) penalty += 5;
|
||||
|
||||
return penalty;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user