mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Clean up artifact extractor.
This commit is contained in:
parent
535a51a621
commit
810515c08d
@@ -9,30 +9,32 @@ public class ArtifactKeywords {
|
||||
|
||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||
|
||||
private final Set<String> words;
|
||||
private static final Set<String> ignoredDomains = Set.of("@hotmail.com", "@gmail.com", "@paypal.com");
|
||||
private static final Set<String> ignoredUsers = Set.of("info", "legal", "contact", "press", "donotreply", "noreply", "no-reply", "admin", "root");
|
||||
|
||||
private final Set<String> words = new HashSet<>();
|
||||
|
||||
public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
|
||||
words = new HashSet<>();
|
||||
|
||||
for (var sent : documentLanguageData.sentences) {
|
||||
for (var word : sent) {
|
||||
String lc = word.wordLowerCase();
|
||||
if (lc.length() < 6
|
||||
|| lc.indexOf('@') < 0
|
||||
|| !mailLikePattern.matcher(lc).matches()) {
|
||||
final String lc = word.wordLowerCase();
|
||||
final int atIdx = lc.indexOf('@');
|
||||
|
||||
if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
words.add(lc);
|
||||
|
||||
String domain = lc.substring(lc.indexOf('@'));
|
||||
String user = lc.substring(0, lc.indexOf('@'));
|
||||
String domain = lc.substring(atIdx);
|
||||
String user = lc.substring(0, atIdx);
|
||||
|
||||
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
|
||||
if (!ignoredDomains.contains(domain)) {
|
||||
words.add(domain.substring(1));
|
||||
words.add(domain);
|
||||
}
|
||||
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
|
||||
if (!ignoredUsers.contains(user)) {
|
||||
words.add(user);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user