mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Add advertisement Feature to search,
Add adblock simulation to processor, Add filename and email address extraction to processor.
This commit is contained in:
parent
d039b138a6
commit
460dd098b0
@ -72,6 +72,8 @@ public class DocumentKeywordExtractor {
|
||||
for (var w : topKeywords)
|
||||
words.remove(w.word);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
var wordSet = new EdgePageWordSet(
|
||||
createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
|
||||
createWords(IndexBlock.Topic, subjects),
|
||||
@ -79,7 +81,8 @@ public class DocumentKeywordExtractor {
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Top, topKeywords),
|
||||
createWords(IndexBlock.Middle, midKeywords),
|
||||
createWords(IndexBlock.Low, lowKeywords)
|
||||
createWords(IndexBlock.Low, lowKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
|
||||
wordSet.append(IndexBlock.Words, words);
|
||||
@ -87,6 +90,32 @@ public class DocumentKeywordExtractor {
|
||||
return wordSet;
|
||||
}
|
||||
|
||||
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||
Set<String> reps = new HashSet<>();
|
||||
|
||||
|
||||
for (var sent : documentLanguageData.sentences) {
|
||||
for (var word : sent) {
|
||||
String lc = word.wordLowerCase();
|
||||
if (lc.matches("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+")) {
|
||||
reps.add(lc);
|
||||
|
||||
String domain = lc.substring(lc.indexOf('@'));
|
||||
String user = lc.substring(0, lc.indexOf('@'));
|
||||
|
||||
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
|
||||
reps.add(domain);
|
||||
}
|
||||
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
|
||||
reps.add(user);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
return reps;
|
||||
}
|
||||
|
||||
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
|
||||
return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
|
||||
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
|
||||
|
@ -118,6 +118,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
|
||||
/** Returns the entry of {@code stemmedWords} at this position's index. */
public String stemmed() {
    return stemmedWords[pos];
}
|
||||
/** Returns the entry of {@code separators} at this position's index. */
public int separator() {
    return separators[pos];
}
|
||||
/** Delegates to the enclosing sentence's {@code isStopWord(int)} for this position's index. */
public boolean isStopWord() {
    return DocumentSentence.this.isStopWord(pos);
}
|
||||
|
||||
public WordRep rep() {
|
||||
return new WordRep(DocumentSentence.this, new WordSpan(pos, pos+1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user