Improve subject extraction and remove an unnecessary calculation from DocumentKeywordExtractor

This commit is contained in:
Viktor Lofgren 2023-01-30 09:41:54 +01:00
parent 4d0b444703
commit 8349435ef4
4 changed files with 115 additions and 50 deletions

View File

@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {
getWordPositions(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
@ -160,7 +159,7 @@ public class DocumentKeywordExtractor {
}
}
for (var names : keywordExtractor.getNames(sent)) {
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);

View File

@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
public class KeywordExtractor {
public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
public WordSpan[] getProperNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isProperNoun(i, sentence))
@ -57,27 +55,73 @@ public class KeywordExtractor {
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(sentence.length());
Set<String> topWords = Collections.emptySet();
public WordSpan[] getNouns(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
if (isNoun(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
if (isNoun(i, sentence)
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if ((isNoun(i, sentence))
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
&& (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
spans.add(new WordSpan(i-2, i+1));
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
spans.add(new WordSpan(i-3, i+1));
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
spans.add(new WordSpan(i-3, i+1));
else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
&& (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
spans.add(new WordSpan(i-3, i+1));
}
}
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence) || isTopAdj(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence)) {
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
spans.add(new WordSpan(i - 1, i + 1));
}
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
@ -86,16 +130,16 @@ public class KeywordExtractor {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
if (isName(i, sentence)) {
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
&& (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
spans.add(new WordSpan(i - 2, i + 1));
}
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
@ -105,10 +149,10 @@ public class KeywordExtractor {
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords) &&
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
if (isName(i, sentence) &&
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
(isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
(isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
spans.add(new WordSpan(i - 3, i + 1));
}
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
@ -134,7 +178,9 @@ public class KeywordExtractor {
/** True when token {@code i} carries a proper-noun POS tag (NNP or NNPS). */
public boolean isProperNoun(int i, DocumentSentence sent) {
    String tag = sent.posTags[i];
    return "NNP".equals(tag) || "NNPS".equals(tag);
}
/** True when token {@code i} carries any noun POS tag (NN, NNS, NNP, NNPS). */
public boolean isNoun(int i, DocumentSentence sent) {
    String tag = sent.posTags[i];
    return tag.startsWith("NN");
}
public boolean isJoiner(DocumentSentence sent, int i) {
if(sent.posTags[i].equals("IN")) {
return true;
@ -183,21 +229,13 @@ public class KeywordExtractor {
return true;
}
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
if (!topWords.isEmpty()) {
String posTag = sentence.posTags[i];
String word = sentence.stemmedWords[i];
return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
}
private boolean isName(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
}
private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
private boolean isTopAdj(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));

View File

@ -20,7 +20,7 @@ public class NameCounter {
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;

View File

@ -1,9 +1,11 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ -23,13 +25,13 @@ public class SubjectCounter {
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public List<WordRep> count(DocumentLanguageData dld) {
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Map<String, Integer> counts = new HashMap<>();
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
@ -46,20 +48,46 @@ public class SubjectCounter {
String stemmed = rep.stemmed;
counts.merge(stemmed, -1, Integer::sum);
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
}
}
}
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
Map<String, Integer> scores = new HashMap<>(instances.size());
for (String stemmed : instances.keySet()) {
scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
}
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
return scores.entrySet().stream()
.filter(e -> e.getValue() >= 150)
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
.collect(Collectors.toList());
}
/**
 * Looks up the normalized TF-IDF score of a stemmed term.
 * <p>
 * Compound terms (parts joined with '_') are scored as the integer mean of
 * their parts' scores, computed recursively. Terms absent from
 * {@code keywordMetadata.wordsTfIdf()} score 0.
 */
private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
    // Simple term: direct lookup in the per-document TF-IDF table.
    if (!stemmed.contains("_")) {
        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
        return meta == null ? 0 : meta.tfIdfNormalized();
    }

    // Compound term: average the scores of its underscore-separated parts.
    String[] parts = StringUtils.split(stemmed, '_');
    if (parts.length == 0) {
        return 0; // degenerate input such as "_" — avoid division by zero
    }

    int total = 0;
    for (String part : parts) {
        total += getTermTfIdf(keywordMetadata, part);
    }
    return total / parts.length;
}
private boolean isDetOrAdverbOrVerb(String posTag) {
return "DT".equals(posTag) // determinant
|| "RB".equals(posTag) // adverb