From 460dd098b041ff00c1fc4f3b00a95cb9c26c44ea Mon Sep 17 00:00:00 2001
From: vlofgren <vlofgren@gmail.com>
Date: Fri, 12 Aug 2022 13:50:18 +0200
Subject: [PATCH] Add advertisement Feature to search, Add adblock simulation
 to processor, Add filename and email address extraction to processor.

---
 .../processing/DocumentKeywordExtractor.java  | 31 ++++++++++++++++++-
 .../processing/model/DocumentSentence.java    |  4 +++
 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
index 570d2462..33b88671 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
@@ -72,6 +72,8 @@ public class DocumentKeywordExtractor {
         for (var w : topKeywords)
             words.remove(w.word);
 
+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
         var wordSet = new EdgePageWordSet(
                 createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
                 createWords(IndexBlock.Topic, subjects),
@@ -79,7 +81,8 @@ public class DocumentKeywordExtractor {
                 createWords(IndexBlock.NamesWords, wordsNamesAll),
                 createWords(IndexBlock.Top, topKeywords),
                 createWords(IndexBlock.Middle, midKeywords),
-                createWords(IndexBlock.Low, lowKeywords)
+                createWords(IndexBlock.Low, lowKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
         );
 
         wordSet.append(IndexBlock.Words, words);
@@ -87,6 +90,32 @@ public class DocumentKeywordExtractor {
         return wordSet;
     }
 
+    /** Full-match e-mail pattern applied to each word's wordLowerCase() form; compiled once instead of once per word. */
+    private static final java.util.regex.Pattern EMAIL_ARTIFACT_PATTERN = java.util.regex.Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
+
+    /** Collects each e-mail address in the document's sentences, plus its user part and "@domain" part unless excluded below. */
+    private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
+        Set<String> reps = new HashSet<>();
+        for (var sent : documentLanguageData.sentences) {
+            for (var word : sent) {
+                String lc = word.wordLowerCase();
+                if (EMAIL_ARTIFACT_PATTERN.matcher(lc).matches()) {
+                    reps.add(lc);
+                    int at = lc.indexOf('@'); // neither side of the pattern admits '@', so this is the only one
+                    String domain = lc.substring(at); // keeps the leading '@'
+                    String user = lc.substring(0, at);
+                    if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
+                        reps.add(domain);
+                    }
+                    if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
+                        reps.add(user);
+                    }
+                }
+            }
+        }
+        return reps;
+    }
+
     private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
         return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                 keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
index 5630939f..b56c5972 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
@@ -118,6 +118,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
         public String stemmed() { return stemmedWords[pos]; }
         public int separator() { return separators[pos]; }
         public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
+        public WordRep rep() { // single-word WordRep for the word at this position
+            var oneWordSpan = new WordSpan(pos, pos + 1);
+            return new WordRep(DocumentSentence.this, oneWordSpan);
+        }
     }
 }