Clean up artifact extractor.

This commit is contained in:
Viktor Lofgren 2023-04-10 13:07:54 +02:00
parent 535a51a621
commit 810515c08d

View File

@@ -9,30 +9,32 @@ public class ArtifactKeywords {
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+"); private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
private final Set<String> words; private static final Set<String> ignoredDomains = Set.of("@hotmail.com", "@gmail.com", "@paypal.com");
private static final Set<String> ignoredUsers = Set.of("info", "legal", "contact", "press", "donotreply", "noreply", "no-reply", "admin", "root");
private final Set<String> words = new HashSet<>();
public ArtifactKeywords(DocumentLanguageData documentLanguageData) { public ArtifactKeywords(DocumentLanguageData documentLanguageData) {
words = new HashSet<>();
for (var sent : documentLanguageData.sentences) { for (var sent : documentLanguageData.sentences) {
for (var word : sent) { for (var word : sent) {
String lc = word.wordLowerCase(); final String lc = word.wordLowerCase();
if (lc.length() < 6 final int atIdx = lc.indexOf('@');
|| lc.indexOf('@') < 0
|| !mailLikePattern.matcher(lc).matches()) { if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) {
continue; continue;
} }
words.add(lc); words.add(lc);
String domain = lc.substring(lc.indexOf('@')); String domain = lc.substring(atIdx);
String user = lc.substring(0, lc.indexOf('@')); String user = lc.substring(0, atIdx);
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) { if (!ignoredDomains.contains(domain)) {
words.add(domain.substring(1)); words.add(domain.substring(1));
words.add(domain); words.add(domain);
} }
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) { if (!ignoredUsers.contains(user)) {
words.add(user); words.add(user);
} }