Clean up artifact extractor.

This commit is contained in:
Viktor Lofgren 2023-04-10 13:07:54 +02:00
parent 535a51a621
commit 810515c08d

View File

@@ -9,30 +9,32 @@ public class ArtifactKeywords {
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
private final Set<String> words;
private static final Set<String> ignoredDomains = Set.of("@hotmail.com", "@gmail.com", "@paypal.com");
private static final Set<String> ignoredUsers = Set.of("info", "legal", "contact", "press", "donotreply", "noreply", "no-reply", "admin", "root");
private final Set<String> words = new HashSet<>();
public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
words = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
if (lc.length() < 6
|| lc.indexOf('@') < 0
|| !mailLikePattern.matcher(lc).matches()) {
final String lc = word.wordLowerCase();
final int atIdx = lc.indexOf('@');
if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) {
continue;
}
words.add(lc);
String domain = lc.substring(lc.indexOf('@'));
String user = lc.substring(0, lc.indexOf('@'));
String domain = lc.substring(atIdx);
String user = lc.substring(0, atIdx);
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
if (!ignoredDomains.contains(domain)) {
words.add(domain.substring(1));
words.add(domain);
}
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
if (!ignoredUsers.contains(user)) {
words.add(user);
}