From 810515c08dd28f3c00388b83310f463d56b64726 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Apr 2023 13:07:54 +0200 Subject: [PATCH] Clean up artifact extractor. --- .../keyword/extractors/ArtifactKeywords.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java index 1b6e3b34..fd66bed2 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java @@ -9,30 +9,32 @@ public class ArtifactKeywords { private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+"); - private final Set words; + private static final Set ignoredDomains = Set.of("@hotmail.com", "@gmail.com", "@paypal.com"); + private static final Set ignoredUsers = Set.of("info", "legal", "contact", "press", "donotreply", "noreply", "no-reply", "admin", "root"); + + private final Set words = new HashSet<>(); public ArtifactKeywords(DocumentLanguageData documentLanguageData) { - words = new HashSet<>(); for (var sent : documentLanguageData.sentences) { for (var word : sent) { - String lc = word.wordLowerCase(); - if (lc.length() < 6 - || lc.indexOf('@') < 0 - || !mailLikePattern.matcher(lc).matches()) { + final String lc = word.wordLowerCase(); + final int atIdx = lc.indexOf('@'); + + if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) { continue; } words.add(lc); - String domain = lc.substring(lc.indexOf('@')); - String user = lc.substring(0, lc.indexOf('@')); + String domain = lc.substring(atIdx); + String user = lc.substring(0, atIdx); - if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) { + if (!ignoredDomains.contains(domain)) { words.add(domain.substring(1)); words.add(domain); } - if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) { + if (!ignoredUsers.contains(user)) { words.add(user); }