mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Clean up artifact extractor.
This commit is contained in:
parent
535a51a621
commit
810515c08d
@ -9,30 +9,32 @@ public class ArtifactKeywords {
|
|||||||
|
|
||||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||||
|
|
||||||
private final Set<String> words;
|
private static final Set<String> ignoredDomains = Set.of("@hotmail.com", "@gmail.com", "@paypal.com");
|
||||||
|
private static final Set<String> ignoredUsers = Set.of("info", "legal", "contact", "press", "donotreply", "noreply", "no-reply", "admin", "root");
|
||||||
|
|
||||||
|
private final Set<String> words = new HashSet<>();
|
||||||
|
|
||||||
public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
|
public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
|
||||||
words = new HashSet<>();
|
|
||||||
|
|
||||||
for (var sent : documentLanguageData.sentences) {
|
for (var sent : documentLanguageData.sentences) {
|
||||||
for (var word : sent) {
|
for (var word : sent) {
|
||||||
String lc = word.wordLowerCase();
|
final String lc = word.wordLowerCase();
|
||||||
if (lc.length() < 6
|
final int atIdx = lc.indexOf('@');
|
||||||
|| lc.indexOf('@') < 0
|
|
||||||
|| !mailLikePattern.matcher(lc).matches()) {
|
if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
words.add(lc);
|
words.add(lc);
|
||||||
|
|
||||||
String domain = lc.substring(lc.indexOf('@'));
|
String domain = lc.substring(atIdx);
|
||||||
String user = lc.substring(0, lc.indexOf('@'));
|
String user = lc.substring(0, atIdx);
|
||||||
|
|
||||||
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
|
if (!ignoredDomains.contains(domain)) {
|
||||||
words.add(domain.substring(1));
|
words.add(domain.substring(1));
|
||||||
words.add(domain);
|
words.add(domain);
|
||||||
}
|
}
|
||||||
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
|
if (!ignoredUsers.contains(user)) {
|
||||||
words.add(user);
|
words.add(user);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user