Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Improve subject extraction and remove an unnecessary calculation from DocumentKeywordExtractor
commit 8349435ef4 (parent 4d0b444703)
@@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
-        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
-
-        tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
+        List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
 
         for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
         for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
@@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {
 
         getWordPositions(keywordMetadata, documentLanguageData);
 
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
-        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
-
         List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
+
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
+        List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
+
 
         for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
         for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
         for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
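The two hunks above reorder the extraction flow: tfIdfCounter.countHisto() now populates keywordMetadata before subjectCounter.count() runs, since the new count() overload ranks subject candidates by the tf-idf data it finds there; the standalone countHisto call in the first hunk, whose result was never read, is simply dropped. A minimal sketch of this ordering dependency, with a plain Map standing in for the real KeywordMetadata and every name below a hypothetical stand-in, not the Marginalia types:

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Minimal sketch of the call-order dependency; all names are stand-ins.
    class CallOrderSketch {
        public static void main(String[] args) {
            // Stand-in for keywordMetadata.wordsTfIdf().
            Map<String, Integer> wordsTfIdf = new HashMap<>();

            // Step 1: the tf-idf pass fills the shared metadata first, which
            // is what tfIdfCounter.countHisto(keywordMetadata, dld) does above.
            wordsTfIdf.put("mcqueen", 310);
            wordsTfIdf.put("drove", 40);

            // Step 2: subject counting can then rank candidates by tf-idf,
            // which is why count() now takes keywordMetadata as an argument.
            List<String> candidates = List.of("mcqueen", "drove");
            candidates.stream()
                    .filter(w -> wordsTfIdf.getOrDefault(w, 0) >= 150)
                    .forEach(w -> System.out.println("subject candidate: " + w));
        }
    }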
@@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
                 ret.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
-            for (var span : keywordExtractor.getNames(sent)) {
+            for (var span : keywordExtractor.getProperNames(sent)) {
                 ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
         }
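The ret.merge(..., posBit, this::bitwiseOr) pattern visible here ORs a position bit into each word's accumulated mask. A toy re-creation of that bookkeeping; the exact meaning and width of posBit are not shown in the diff, so this sketch assumes one bit per sentence and int-sized masks:

    import java.util.Map;
    import java.util.TreeMap;

    // Toy re-creation of the position-mask bookkeeping: one bit per sentence
    // (an assumption), OR-ed into each word's mask like ret.merge(..., bitwiseOr).
    class WordPositionSketch {
        public static void main(String[] args) {
            Map<String, Integer> ret = new TreeMap<>();
            String[][] sentences = { {"steve", "mcqueen"}, {"mcqueen", "drove"} };

            int posBit = 1;
            for (String[] sent : sentences) {
                for (String word : sent)
                    ret.merge(word, posBit, (a, b) -> a | b); // bitwise OR
                posBit <<= 1; // next sentence, next bit
            }
            System.out.println(ret); // {drove=2, mcqueen=3, steve=1}
        }
    }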
@@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
                 ret.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
-            for (var span : keywordExtractor.getNames(sent)) {
+            for (var span : keywordExtractor.getProperNames(sent)) {
                 ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
 
@@ -160,7 +159,7 @@ public class DocumentKeywordExtractor {
             }
         }
 
-        for (var names : keywordExtractor.getNames(sent)) {
+        for (var names : keywordExtractor.getProperNames(sent)) {
             var rep = new WordRep(sent, names);
             String w = AsciiFlattener.flattenUnicode(rep.word);
 
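AsciiFlattener itself is not part of this diff. As an illustration of the kind of normalization flattenUnicode() presumably performs, here is the standard NFKD decompose-and-strip-marks approach from the JDK — an assumption about the technique, not Marginalia's actual implementation:

    import java.text.Normalizer;

    // Illustrative only: NFKD-decompose, then strip combining marks. Assumed
    // (not confirmed) to resemble what AsciiFlattener.flattenUnicode does.
    class FlattenSketch {
        static String flatten(String s) {
            return Normalizer.normalize(s, Normalizer.Form.NFKD)
                    .replaceAll("\\p{M}", ""); // drop diacritic marks
        }

        public static void main(String[] args) {
            System.out.println(flatten("Pokémon")); // Pokemon
        }
    }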
@@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;
 
 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
-import java.util.Set;
 
 public class KeywordExtractor {
 
-    public WordSpan[] getNames(DocumentSentence sentence) {
-        List<WordSpan> spans = new ArrayList<>(sentence.length());
+    public WordSpan[] getProperNames(DocumentSentence sentence) {
+        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
 
         for (int i = 0; i < sentence.length(); i++) {
             if (isProperNoun(i, sentence))
@@ -57,27 +55,73 @@ public class KeywordExtractor {
         return spans.toArray(WordSpan[]::new);
     }
 
-    public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
-        if (sentence.keywords != null) {
-            return sentence.keywords.get();
-        }
-        List<WordSpan> spans = new ArrayList<>(sentence.length());
-
-        Set<String> topWords = Collections.emptySet();
+    public WordSpan[] getNouns(DocumentSentence sentence) {
+        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
 
         for (int i = 0; i < sentence.length(); i++) {
-            if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
+            if (isNoun(i, sentence))
                 spans.add(new WordSpan(i, i+1));
         }
 
         for (int i = 1; i < sentence.length(); i++) {
             if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
 
-            if (isName(i, sentence, topWords)) {
-                if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
+            if (isNoun(i, sentence)
+                && (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
                 spans.add(new WordSpan(i - 1, i + 1));
             }
+        }
+
+        for (int i = 2; i < sentence.length(); i++) {
+            if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
+            if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
+
+            if ((isNoun(i, sentence))
+                    && (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
+                    && (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
+                spans.add(new WordSpan(i-2, i+1));
+        }
+
+        for (int i = 3; i < sentence.length(); i++) {
+            if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
+            if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
+            if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
+
+            if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
+                if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
+                    spans.add(new WordSpan(i-3, i+1));
+                else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
+                    spans.add(new WordSpan(i-3, i+1));
+                else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
+                        && (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
+                    spans.add(new WordSpan(i-3, i+1));
+            }
+        }
+
+        return spans.toArray(WordSpan[]::new);
+    }
+
+
+    public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
+        if (sentence.keywords != null) {
+            return sentence.keywords.get();
+        }
+        List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
+
+        for (int i = 0; i < sentence.length(); i++) {
+            if (isName(i, sentence) || isTopAdj(i, sentence))
+                spans.add(new WordSpan(i, i+1));
+        }
+
+        for (int i = 1; i < sentence.length(); i++) {
+            if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
+
+            if (isName(i, sentence)) {
+                if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
+                    spans.add(new WordSpan(i - 1, i + 1));
+            }
-            if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
+            if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
                 spans.add(new WordSpan(i - 1, i + 1));
            }
         }
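The new getNouns() builds candidate spans of one to four words: single nouns, then noun n-grams that may pull in a leading adjective ("JJ") or an internal joiner. A self-contained sketch of the one- and two-word passes, using raw Penn Treebank tag arrays instead of DocumentSentence and int pairs instead of WordSpan; the bigram condition below reflects the apparent intent of the diff, and the three- and four-word passes plus comma handling are omitted:

    import java.util.ArrayList;
    import java.util.List;

    // Simplified sketch of the first two passes of getNouns(): single nouns,
    // plus bigrams where a noun follows a noun or adjective.
    class NounSpanSketch {
        static boolean isNoun(String tag) { return tag.startsWith("NN"); }

        static List<int[]> nounSpans(String[] posTags) {
            List<int[]> spans = new ArrayList<>();
            for (int i = 0; i < posTags.length; i++) {
                if (isNoun(posTags[i]))
                    spans.add(new int[] {i, i + 1});      // single noun
            }
            for (int i = 1; i < posTags.length; i++) {
                if (isNoun(posTags[i])
                        && (isNoun(posTags[i - 1]) || "JJ".equals(posTags[i - 1])))
                    spans.add(new int[] {i - 1, i + 1});  // adjective/noun + noun
            }
            return spans;
        }

        public static void main(String[] args) {
            // "fast cars" tagged JJ NNS yields [1, 2) and [0, 2)
            for (int[] s : nounSpans(new String[] {"JJ", "NNS"}))
                System.out.println("[" + s[0] + ", " + s[1] + ")");
        }
    }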
@@ -86,16 +130,16 @@ public class KeywordExtractor {
             if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
             if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
 
-            if (isName(i, sentence, topWords)) {
-                if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
-                        && (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
+            if (isName(i, sentence)) {
+                if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
+                        && (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
                     spans.add(new WordSpan(i - 2, i + 1));
                 }
                 else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
                     spans.add(new WordSpan(i - 2, i + 1));
                 }
             }
-            else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
+            else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
                 spans.add(new WordSpan(i - 2, i + 1));
             }
         }
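The "CD" branch keeps cardinal numbers attached to preceding name words, so spans like "World War 2" survive as a single keyword. A minimal check of that tag pattern, with plain arrays instead of DocumentSentence and stop-word/separator handling omitted:

    // Minimal check of the tag pattern behind the "CD" branch.
    class CardinalNameSketch {
        static boolean isName(String tag) { return tag.startsWith("N"); }

        public static void main(String[] args) {
            String[] tags = {"NNP", "NNP", "CD"}; // e.g. "World War 2"
            int i = 2;
            if (tags[i].equals("CD") && isName(tags[i - 1]) && isName(tags[i - 2]))
                System.out.println("span: [" + (i - 2) + ", " + (i + 1) + ")");
        }
    }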
@@ -105,10 +149,10 @@ public class KeywordExtractor {
             if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
             if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
 
-            if (isName(i, sentence, topWords) &&
-                    (isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
-                    (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
-                    (isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
+            if (isName(i, sentence) &&
+                    (isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
+                    (isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
+                    (isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
                 spans.add(new WordSpan(i - 3, i + 1));
             }
             else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
@@ -134,7 +178,9 @@ public class KeywordExtractor {
     public boolean isProperNoun(int i, DocumentSentence sent) {
         return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
     }
 
+    public boolean isNoun(int i, DocumentSentence sent) {
+        return sent.posTags[i].startsWith("NN");
+    }
     public boolean isJoiner(DocumentSentence sent, int i) {
         if(sent.posTags[i].equals("IN")) {
             return true;
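These predicates work on Penn Treebank POS tags: NN/NNS mark common nouns, NNP/NNPS proper nouns, and IN prepositions, which the code treats as "joiners" (note that isJoiner() continues past the IN check with further conditions not visible in this hunk). A tiny stand-alone mirror of the tag tests:

    // Stand-alone mirror of the POS-tag predicates added/used above.
    class PosTagDemo {
        static boolean isProperNoun(String tag) { return "NNP".equals(tag) || "NNPS".equals(tag); }
        static boolean isNoun(String tag)       { return tag.startsWith("NN"); }
        static boolean isJoinerTag(String tag)  { return "IN".equals(tag); }

        public static void main(String[] args) {
            System.out.println(isNoun("NNS"));       // true
            System.out.println(isProperNoun("NNS")); // false
            System.out.println(isNoun("NNP"));       // true: startsWith("NN") covers proper nouns too
            System.out.println(isJoinerTag("IN"));   // true: e.g. "of" in "Wars of the Roses"
        }
    }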
@@ -183,21 +229,13 @@ public class KeywordExtractor {
         return true;
     }
 
-    private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
-        if (!topWords.isEmpty()) {
-            String posTag = sentence.posTags[i];
-            String word = sentence.stemmedWords[i];
-
-            return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
-        }
-
-
+    private boolean isName(int i, DocumentSentence sentence) {
         String posTag = sentence.posTags[i];
 
-        return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
+        return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
     }
 
-    private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
+    private boolean isTopAdj(int i, DocumentSentence sentence) {
         String posTag = sentence.posTags[i];
 
         return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
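Besides dropping the unused topWords path, the simplified isName() also gains "VBG" (gerunds) next to "VBN" (past participles), so a token like "Watching" in "Bird Watching" can now head a name span. A quick check of the widened tag test, with the stop-word rejection of the real method omitted:

    // Quick check of the widened tag test in the new isName().
    class IsNameDemo {
        static boolean isNameTag(String tag) {
            return tag.startsWith("N") || "VBG".equals(tag) || "VBN".equals(tag);
        }

        public static void main(String[] args) {
            System.out.println(isNameTag("VBG")); // true  (new in this commit)
            System.out.println(isNameTag("VBN")); // true  (accepted before as well)
            System.out.println(isNameTag("JJ"));  // false (adjectives go through isTopAdj)
        }
    }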
@@ -20,7 +20,7 @@ public class NameCounter {
 
         for (int i = 0; i < dld.sentences.length; i++) {
             DocumentSentence sent = dld.sentences[i];
-            var keywords = keywordExtractor.getNames(sent);
+            var keywords = keywordExtractor.getProperNames(sent);
             for (var span : keywords) {
                 if (span.size() <= 1)
                     continue;
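NameCounter keeps only multi-word spans from getProperNames() and tallies how often each occurs. A hypothetical miniature of that counting, with joined strings standing in for the stemmed DocumentSentence spans the real code uses:

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    // Hypothetical miniature of NameCounter's tally over proper-name spans:
    // single-word spans are skipped, multi-word names are counted.
    class NameCountSketch {
        public static void main(String[] args) {
            List<List<String>> properNameSpans = List.of(
                    List.of("Steve", "McQueen"),
                    List.of("Steve", "McQueen"),
                    List.of("McQueen"));          // size <= 1: skipped

            Map<String, Integer> counts = new HashMap<>();
            for (List<String> span : properNameSpans) {
                if (span.size() <= 1)
                    continue;
                counts.merge(String.join("_", span), 1, Integer::sum);
            }
            System.out.println(counts); // {Steve_McQueen=2}
        }
    }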
@@ -1,9 +1,11 @@
 package nu.marginalia.util.language.processing;
 
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
+import org.apache.commons.lang3.StringUtils;
 
 import java.util.*;
 import java.util.stream.Collectors;
@@ -23,13 +25,13 @@ public class SubjectCounter {
     // Greeks bearing gifts -> Greeks
     // Steve McQueen drove fast | cars -> Steve McQueen
 
-    public List<WordRep> count(DocumentLanguageData dld) {
+    public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
 
         Map<String, Integer> counts = new HashMap<>();
         Map<String, Set<WordRep>> instances = new HashMap<>();
 
         for (var sentence : dld.sentences) {
-            for (WordSpan kw : keywordExtractor.getNames(sentence)) {
+            for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
                 if (kw.end + 2 >= sentence.length()) {
                     continue;
                 }
@@ -46,20 +48,46 @@ public class SubjectCounter {
 
                     String stemmed = rep.stemmed;
 
                     counts.merge(stemmed, -1, Integer::sum);
                     instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
                 }
             }
         }
 
-        int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
+        Map<String, Integer> scores = new HashMap<>(instances.size());
+        for (String stemmed : instances.keySet()) {
+            scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
+        }
 
-        return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                .filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
+        return scores.entrySet().stream()
+                .filter(e -> e.getValue() >= 150)
                 .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
                 .collect(Collectors.toList());
     }
 
+    private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
+        if (stemmed.contains("_")) {
+            int sum = 0;
+            String[] parts = StringUtils.split(stemmed, '_');
+
+            if (parts.length == 0) {
+                return 0;
+            }
+
+            for (String part : parts) {
+                sum += getTermTfIdf(keywordMetadata, part);
+            }
+
+            return sum / parts.length;
+        }
+
+        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
+        if (meta != null) {
+            return meta.tfIdfNormalized();
+        }
+
+        return 0;
+    }
+
     private boolean isDetOrAdverbOrVerb(String posTag) {
         return "DT".equals(posTag) // determinant
                 || "RB".equals(posTag) // adverb
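This last hunk is where the "better subject extraction" happens: the old relative-count filter (keep candidates mentioned clearly more often than the minimum) is replaced by an absolute tf-idf threshold — a candidate survives if its normalized tf-idf is at least 150, and a multi-word term stored as an underscore-joined stem is scored by the average of its parts. A runnable miniature of that scoring; the Map stands in for keywordMetadata.wordsTfIdf() and the numbers are invented:

    import java.util.Map;
    import org.apache.commons.lang3.StringUtils;

    // Runnable miniature of the new scoring (needs commons-lang3 on the
    // classpath, which the diff imports in the real SubjectCounter).
    class TfIdfScoreSketch {
        static final Map<String, Integer> WORDS_TF_IDF =
                Map.of("steve", 120, "mcqueen", 340, "drove", 30);

        static int getTermTfIdf(String stemmed) {
            if (stemmed.contains("_")) {
                String[] parts = StringUtils.split(stemmed, '_');
                if (parts.length == 0) return 0;
                int sum = 0;
                for (String part : parts)
                    sum += getTermTfIdf(part);
                return sum / parts.length;        // average over the words
            }
            return WORDS_TF_IDF.getOrDefault(stemmed, 0);
        }

        public static void main(String[] args) {
            System.out.println(getTermTfIdf("steve_mcqueen")); // (120 + 340) / 2 = 230 -> kept
            System.out.println(getTermTfIdf("drove"));         // 30 -> filtered out (< 150)
        }
    }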