Add advertisement feature to search,

Add adblock simulation to processor,
Add filename and email address extraction to processor.
This commit is contained in:
vlofgren 2022-08-12 13:50:18 +02:00
parent d039b138a6
commit 460dd098b0
2 changed files with 34 additions and 1 deletions

View File

@ -72,6 +72,8 @@ public class DocumentKeywordExtractor {
for (var w : topKeywords) for (var w : topKeywords)
words.remove(w.word); words.remove(w.word);
Collection<String> artifacts = getArtifacts(documentLanguageData);
var wordSet = new EdgePageWordSet( var wordSet = new EdgePageWordSet(
createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)), createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
createWords(IndexBlock.Topic, subjects), createWords(IndexBlock.Topic, subjects),
@ -79,7 +81,8 @@ public class DocumentKeywordExtractor {
createWords(IndexBlock.NamesWords, wordsNamesAll), createWords(IndexBlock.NamesWords, wordsNamesAll),
createWords(IndexBlock.Top, topKeywords), createWords(IndexBlock.Top, topKeywords),
createWords(IndexBlock.Middle, midKeywords), createWords(IndexBlock.Middle, midKeywords),
createWords(IndexBlock.Low, lowKeywords) createWords(IndexBlock.Low, lowKeywords),
new EdgePageWords(IndexBlock.Artifacts, artifacts)
); );
wordSet.append(IndexBlock.Words, words); wordSet.append(IndexBlock.Words, words);
@ -87,6 +90,32 @@ public class DocumentKeywordExtractor {
return wordSet; return wordSet;
} }
/**
 * Matches a whole token of the form {@code user@host.tld}: a user part made of
 * letters, digits, dots, underscores and hyphens, followed by a domain of at
 * least two dot-separated alphanumeric labels.
 *
 * Compiled once here because {@link String#matches(String)} would recompile
 * the expression for every word that is examined.
 */
private static final java.util.regex.Pattern EMAIL_ADDRESS_PATTERN =
        java.util.regex.Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

/**
 * Collects e-mail-address artifacts from the words of a document.
 *
 * For every word whose lower-cased form is an e-mail address, the result
 * contains:
 * <ul>
 *   <li>the full address;</li>
 *   <li>the domain part including the leading {@code '@'}, unless it is
 *       {@code @hotmail.com}, {@code @gmail.com} or {@code @paypal.com};</li>
 *   <li>the user part, unless it is {@code info}, {@code legal},
 *       {@code contact} or {@code donotreply}.</li>
 * </ul>
 *
 * @param documentLanguageData the document whose {@code sentences} are scanned
 * @return the distinct artifact strings found; empty if no word matched
 */
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
    Set<String> reps = new HashSet<>();

    for (var sent : documentLanguageData.sentences) {
        for (var word : sent) {
            String lc = word.wordLowerCase();

            if (!EMAIL_ADDRESS_PATTERN.matcher(lc).matches()) {
                continue;
            }

            reps.add(lc);

            // The pattern admits exactly one '@', so this index cleanly splits user from domain.
            int at = lc.indexOf('@');
            String domain = lc.substring(at);   // keeps the leading '@'
            String user = lc.substring(0, at);

            // Skip the listed mail-provider domains.
            if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
                reps.add(domain);
            }

            // Skip the listed generic mailbox names.
            if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
                reps.add(user);
            }
        }
    }

    return reps;
}
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) { private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent -> return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w))) keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))

View File

@ -118,6 +118,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public String stemmed() { return stemmedWords[pos]; } public String stemmed() { return stemmedWords[pos]; }
public int separator() { return separators[pos]; } public int separator() { return separators[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
/**
 * Builds a {@code WordRep} for the word at this position.
 *
 * @return a new {@code WordRep} created from the enclosing sentence and a
 *         {@code WordSpan} constructed with {@code (pos, pos + 1)}
 */
public WordRep rep() {
    var singleWordSpan = new WordSpan(pos, pos + 1);
    return new WordRep(DocumentSentence.this, singleWordSpan);
}
} }
} }