Add advertisement feature to search.

Add adblock simulation to processor. Add filename and email address extraction to processor.
2025-02-24 05:18:58 +00:00 · 2022-08-12 13:50:18 +02:00 · 2022-08-12 13:50:18 +02:00 · 460dd098b0
commit 460dd098b0
parent d039b138a6
2 changed files with 34 additions and 1 deletions
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
@ -72,6 +72,8 @@ public class DocumentKeywordExtractor {
        for (var w : topKeywords)
            words.remove(w.word);

+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
        var wordSet = new EdgePageWordSet(
                createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
                createWords(IndexBlock.Topic, subjects),
@ -79,7 +81,8 @@ public class DocumentKeywordExtractor {
                createWords(IndexBlock.NamesWords, wordsNamesAll),
                createWords(IndexBlock.Top, topKeywords),
                createWords(IndexBlock.Middle, midKeywords),
-                createWords(IndexBlock.Low, lowKeywords)
+                createWords(IndexBlock.Low, lowKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
        );

        wordSet.append(IndexBlock.Words, words);
@ -87,6 +90,32 @@ public class DocumentKeywordExtractor {
        return wordSet;
    }

+    /** Matches a whole token shaped like an e-mail address; compiled once instead of once per word. */
+    private static final java.util.regex.Pattern EMAIL_ADDRESS_PATTERN =
+            java.util.regex.Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
+
+    /** Mail domains (including the leading '@') that are not added as artifacts of their own. */
+    private static final Set<String> IGNORED_EMAIL_DOMAINS = Set.of("@hotmail.com", "@gmail.com", "@paypal.com");
+
+    /** Mailbox names that are not added as artifacts of their own. */
+    private static final Set<String> IGNORED_EMAIL_USERS = Set.of("info", "legal", "contact", "donotreply");
+
+    /**
+     * Collects e-mail related artifact keywords from every word of every sentence in the document.
+     *
+     * <p>For each word whose {@code wordLowerCase()} form matches {@link #EMAIL_ADDRESS_PATTERN},
+     * the result contains the full address, the {@code "@domain"} part unless it is listed in
+     * {@link #IGNORED_EMAIL_DOMAINS}, and the part before the {@code '@'} unless it is listed in
+     * {@link #IGNORED_EMAIL_USERS}.
+     *
+     * @param documentLanguageData the document whose {@code sentences} are scanned
+     * @return the distinct artifact strings found; empty if no word matches
+     */
+    private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
+        Set<String> artifacts = new HashSet<>();
+
+        for (var sent : documentLanguageData.sentences) {
+            for (var word : sent) {
+                // NOTE(review): presumably the lower-cased surface form of the word - confirm.
+                String lc = word.wordLowerCase();
+                if (!EMAIL_ADDRESS_PATTERN.matcher(lc).matches()) {
+                    continue;
+                }
+                artifacts.add(lc);
+
+                // The pattern admits exactly one '@' preceded by at least one character,
+                // so the split index is always > 0 and both substrings are well-defined.
+                int at = lc.indexOf('@');
+                String domain = lc.substring(at);
+                String user = lc.substring(0, at);
+
+                if (!IGNORED_EMAIL_DOMAINS.contains(domain)) {
+                    artifacts.add(domain);
+                }
+                if (!IGNORED_EMAIL_USERS.contains(user)) {
+                    artifacts.add(user);
+                }
+            }
+        }
+        return artifacts;
+    }
+
    private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
        return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
@ -118,6 +118,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
        public String stemmed() { return stemmedWords[pos]; }
        public int separator() { return separators[pos]; }
        public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
+
+        /**
+         * Wraps this word position in a {@link WordRep} bound to the enclosing sentence.
+         *
+         * @return a new {@code WordRep} built from the enclosing {@code DocumentSentence}
+         *         and a {@code WordSpan} constructed from {@code pos} and {@code pos + 1}
+         */
+        public WordRep rep() {
+            // NOTE(review): presumably a span covering just this one word - confirm WordSpan's bounds convention.
+            WordSpan singleWordSpan = new WordSpan(pos, pos + 1);
+            return new WordRep(DocumentSentence.this, singleWordSpan);
+        }
    }
 }