mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(converter) Penalize chatgpt content farm spam
This commit is contained in:
parent
109bec372c
commit
e53bb70bef
@ -11,6 +11,7 @@ import org.jsoup.nodes.Element;
|
|||||||
import org.jsoup.nodes.Node;
|
import org.jsoup.nodes.Node;
|
||||||
import org.jsoup.select.NodeVisitor;
|
import org.jsoup.select.NodeVisitor;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
public class DocumentValuator {
|
public class DocumentValuator {
|
||||||
@ -21,6 +22,7 @@ public class DocumentValuator {
|
|||||||
int textLength) throws DisqualifiedException {
|
int textLength) throws DisqualifiedException {
|
||||||
|
|
||||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||||
|
double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument);
|
||||||
|
|
||||||
int rawLength = crawledDocument.documentBody.length();
|
int rawLength = crawledDocument.documentBody.length();
|
||||||
|
|
||||||
@ -30,7 +32,36 @@ public class DocumentValuator {
|
|||||||
|
|
||||||
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
|
return Math.log(textLength / (double) (1+rawLength))*htmlStandard.scale
|
||||||
+ htmlStandard.offset
|
+ htmlStandard.offset
|
||||||
- scriptPenalty;
|
- scriptPenalty
|
||||||
|
- chatGptPenalty;
|
||||||
|
}
|
||||||
|
|
||||||
|
private double getChatGptContentFarmPenalty(Document parsedDocument) {
|
||||||
|
// easily 90% of modern AI-authored content farm spam has these nonsense headers
|
||||||
|
|
||||||
|
boolean benefitsOf = false, keyBenefits = false, keyTakeaways = false;
|
||||||
|
|
||||||
|
outer:
|
||||||
|
for (String tagName : List.of("h1", "h2", "h3")) {
|
||||||
|
for (var elem : parsedDocument.getElementsByTag(tagName)) {
|
||||||
|
if (benefitsOf && keyBenefits && keyTakeaways)
|
||||||
|
break outer;
|
||||||
|
|
||||||
|
String text = elem.text().toLowerCase();
|
||||||
|
|
||||||
|
benefitsOf = benefitsOf || text.startsWith("benefits of");
|
||||||
|
keyBenefits = keyBenefits || text.startsWith("key benefits");
|
||||||
|
keyTakeaways = keyTakeaways || text.startsWith("key takeaways");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double penalty = 0;
|
||||||
|
|
||||||
|
if (benefitsOf) penalty += 10;
|
||||||
|
if (keyBenefits) penalty += 5;
|
||||||
|
if (keyTakeaways) penalty += 5;
|
||||||
|
|
||||||
|
return penalty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user