From 460dd098b041ff00c1fc4f3b00a95cb9c26c44ea Mon Sep 17 00:00:00 2001
From: vlofgren
Date: Fri, 12 Aug 2022 13:50:18 +0200
Subject: [PATCH] Add advertisement Feature to search, Add adblock simulation to processor, Add filename and email address extraction to processor.

---
Reviewer notes (this section is ignored when the patch is applied):
 * Line breaks, indentation and blank context lines were reconstructed from a
   whitespace-collapsed copy of this patch and are a best guess; verify the
   context against the target tree before applying.
 * Generic type arguments on context lines ("List extractTitleWords",
   "implements Iterable{") look stripped by the same extraction and are kept
   as found; TODO confirm against the target tree.
 * Added code was revised: the e-mail regex is compiled once instead of through
   String.matches() for every word, the '@' is located once, and getArtifacts
   is typed as Collection<String>.

 .../processing/DocumentKeywordExtractor.java  | 46 ++++++++++++++++++-
 .../processing/model/DocumentSentence.java    |  5 ++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
index 570d2462..33b88671 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
@@ -72,6 +72,8 @@ public class DocumentKeywordExtractor {
         for (var w : topKeywords)
             words.remove(w.word);
 
+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
         var wordSet = new EdgePageWordSet(
                 createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
                 createWords(IndexBlock.Topic, subjects),
@@ -79,7 +81,8 @@ public class DocumentKeywordExtractor {
                 createWords(IndexBlock.NamesWords, wordsNamesAll),
                 createWords(IndexBlock.Top, topKeywords),
                 createWords(IndexBlock.Middle, midKeywords),
-                createWords(IndexBlock.Low, lowKeywords)
+                createWords(IndexBlock.Low, lowKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
         );
 
         wordSet.append(IndexBlock.Words, words);
@@ -87,6 +90,47 @@ public class DocumentKeywordExtractor {
         return wordSet;
     }
 
+    /** Matches a word that is, as a whole, an e-mail address; compiled once as every word is tested. */
+    private static final java.util.regex.Pattern EMAIL_PATTERN =
+            java.util.regex.Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
+
+    /**
+     * Collects e-mail derived keywords from every word of the document's sentences.
+     * <p>
+     * A word that is an e-mail address contributes the address itself, its "@domain" part
+     * (except @hotmail.com, @gmail.com and @paypal.com) and its user part (except info,
+     * legal, contact and donotreply). Matching is done on each word's wordLowerCase() form.
+     *
+     * @param documentLanguageData document whose sentences are scanned
+     * @return the distinct keywords found; empty if the document has no e-mail addresses
+     */
+    private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
+        Set<String> reps = new HashSet<>();
+
+        for (var sent : documentLanguageData.sentences) {
+            for (var word : sent) {
+                String lc = word.wordLowerCase();
+                if (!EMAIL_PATTERN.matcher(lc).matches()) {
+                    continue;
+                }
+                reps.add(lc);
+
+                // The pattern admits no '@' in the user or domain part, so this is the only one
+                int at = lc.indexOf('@');
+                String domain = lc.substring(at);
+                String user = lc.substring(0, at);
+
+                if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
+                    reps.add(domain);
+                }
+                if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
+                    reps.add(user);
+                }
+            }
+        }
+        return reps;
+    }
+
     private List extractTitleWords(DocumentLanguageData documentLanguageData) {
         return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                 keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
index 5630939f..b56c5972 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentSentence.java
@@ -118,6 +118,11 @@ public class DocumentSentence implements Iterable{
         public String stemmed() { return stemmedWords[pos]; }
         public int separator() { return separators[pos]; }
         public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
+
+        /** Wraps this position as a {@link WordRep} over the span (pos, pos+1) of the enclosing sentence. */
+        public WordRep rep() {
+            return new WordRep(DocumentSentence.this, new WordSpan(pos, pos+1));
+        }
     }
 
 }