diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index fddd7e28..cf97302e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -35,9 +35,7 @@ public class DocumentKeywordExtractor { List titleWords = extractTitleWords(documentLanguageData); List wordsNamesAll = nameCounter.count(documentLanguageData, 2); - List subjects = subjectCounter.count(documentLanguageData); - - tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); + List subjects = subjectCounter.count(keywordMetadata, documentLanguageData); for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); @@ -59,11 +57,12 @@ public class DocumentKeywordExtractor { getWordPositions(keywordMetadata, documentLanguageData); - List wordsNamesAll = nameCounter.count(documentLanguageData, 2); - List subjects = subjectCounter.count(documentLanguageData); - List wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData); + List wordsNamesAll = nameCounter.count(documentLanguageData, 2); + List subjects = subjectCounter.count(keywordMetadata, documentLanguageData); + + for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed); for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed); for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed); @@ -94,7 +93,7 @@ public class DocumentKeywordExtractor { ret.merge(word.stemmed(), posBit, this::bitwiseOr); } - for (var span : keywordExtractor.getNames(sent)) { + for (var span : keywordExtractor.getProperNames(sent)) { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, 
this::bitwiseOr); } } @@ -108,7 +107,7 @@ public class DocumentKeywordExtractor { ret.merge(word.stemmed(), posBit, this::bitwiseOr); } - for (var span : keywordExtractor.getNames(sent)) { + for (var span : keywordExtractor.getProperNames(sent)) { ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } @@ -160,7 +159,7 @@ public class DocumentKeywordExtractor { } } - for (var names : keywordExtractor.getNames(sent)) { + for (var names : keywordExtractor.getProperNames(sent)) { var rep = new WordRep(sent, names); String w = AsciiFlattener.flattenUnicode(rep.word); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java index 7e56830e..8673ac4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordExtractor.java @@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.Set; public class KeywordExtractor { - public WordSpan[] getNames(DocumentSentence sentence) { - List spans = new ArrayList<>(sentence.length()); + public WordSpan[] getProperNames(DocumentSentence sentence) { + List spans = new ArrayList<>(2 * sentence.length()); for (int i = 0; i < sentence.length(); i++) { if (isProperNoun(i, sentence)) @@ -57,27 +55,73 @@ public class KeywordExtractor { return spans.toArray(WordSpan[]::new); } - public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { - if (sentence.keywords != null) { - return sentence.keywords.get(); - } - List spans = new ArrayList<>(sentence.length()); - Set topWords = Collections.emptySet(); + public WordSpan[] getNouns(DocumentSentence sentence) { + List spans = new ArrayList<>(2 * 
sentence.length()); for (int i = 0; i < sentence.length(); i++) { - if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords)) + if (isNoun(i, sentence)) spans.add(new WordSpan(i, i+1)); } for (int i = 1; i < sentence.length(); i++) { if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords)) { - if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) + if (isNoun(i, sentence) /* head word must be a noun; the adjective (JJ) alternative applies only to the preceding word */ + && (isNoun(i-1, sentence) || "JJ".equals(sentence.posTags[i-1]))) { + spans.add(new WordSpan(i - 1, i + 1)); + } + } + + for (int i = 2; i < sentence.length(); i++) { + if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + + if ((isNoun(i, sentence)) + && (isJoiner(sentence, i-1) || isNoun(i-1, sentence)) + && (isNoun(i-2, sentence) || "JJ".equals(sentence.posTags[i-2]))) /* JJ is an alternative for the first word only, grouped like the four-word case below */ + spans.add(new WordSpan(i-2, i+1)); + } + + for (int i = 3; i < sentence.length(); i++) { + if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } + if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } + if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + + if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) { + if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence)) + spans.add(new WordSpan(i-3, i+1)); + else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT")) + spans.add(new WordSpan(i-3, i+1)); + else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence)) + && (isJoiner(sentence, i-2)||isNoun(i-2, sentence))) + spans.add(new WordSpan(i-3, i+1)); + } + } + + return spans.toArray(WordSpan[]::new); + } + + + public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) { + if (sentence.keywords != null) { + return sentence.keywords.get(); + } + List spans = new ArrayList<>(2 * sentence.length()); + + for (int i = 0; i < sentence.length(); i++) { + 
if (isName(i, sentence) || isTopAdj(i, sentence)) + spans.add(new WordSpan(i, i+1)); + } + + for (int i = 1; i < sentence.length(); i++) { + if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + + if (isName(i, sentence)) { + if (isName(i - 1, sentence) || isTopAdj(i-1, sentence)) spans.add(new WordSpan(i - 1, i + 1)); } - if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) { + if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) { spans.add(new WordSpan(i - 1, i + 1)); } } @@ -86,16 +130,16 @@ public class KeywordExtractor { if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords)) { - if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) - && (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) { + if (isName(i, sentence)) { + if ((isName(i-1, sentence) || isTopAdj(i-1, sentence)) + && (isName(i-2, sentence) || isTopAdj(i-2, sentence))) { spans.add(new WordSpan(i - 2, i + 1)); } else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) { spans.add(new WordSpan(i - 2, i + 1)); } } - else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) { + else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) { spans.add(new WordSpan(i - 2, i + 1)); } } @@ -105,10 +149,10 @@ public class KeywordExtractor { if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (isName(i, sentence, topWords) && - (isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) && - (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) && - (isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) { + if (isName(i, sentence) && + 
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) && + (isName(i-2, sentence) || isTopAdj(i-2, sentence)) && + (isName(i-3, sentence) || isTopAdj(i-3, sentence))) { spans.add(new WordSpan(i - 3, i + 1)); } else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { @@ -134,7 +178,9 @@ public class KeywordExtractor { public boolean isProperNoun(int i, DocumentSentence sent) { return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]); } - + public boolean isNoun(int i, DocumentSentence sent) { + return sent.posTags[i].startsWith("NN"); + } public boolean isJoiner(DocumentSentence sent, int i) { if(sent.posTags[i].equals("IN")) { return true; @@ -183,21 +229,13 @@ public class KeywordExtractor { return true; } - private boolean isName(int i, DocumentSentence sentence, Set topWords) { - if (!topWords.isEmpty()) { - String posTag = sentence.posTags[i]; - String word = sentence.stemmedWords[i]; - - return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i)); - } - - + private boolean isName(int i, DocumentSentence sentence) { String posTag = sentence.posTags[i]; - return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i); + return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i); } - private boolean isTopAdj(int i, DocumentSentence sentence, Set topWords) { + private boolean isTopAdj(int i, DocumentSentence sentence) { String posTag = sentence.posTags[i]; return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG")); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java index 476b7b5d..221790d6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/NameCounter.java @@ -20,7 +20,7 @@ public class NameCounter { for (int i = 0; i < dld.sentences.length; i++) { DocumentSentence sent = dld.sentences[i]; - var keywords = keywordExtractor.getNames(sent); + var keywords = keywordExtractor.getProperNames(sent); for (var span : keywords) { if (span.size() <= 1) continue; diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java index af774898..b0f46f30 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java @@ -1,9 +1,11 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; +import nu.marginalia.util.language.processing.model.KeywordMetadata; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; +import org.apache.commons.lang3.StringUtils; import java.util.*; import java.util.stream.Collectors; @@ -23,13 +25,13 @@ public class SubjectCounter { // Greeks bearing gifts -> Greeks // Steve McQueen drove fast | cars -> Steve McQueen - public List count(DocumentLanguageData dld) { + public List count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) { Map counts = new HashMap<>(); Map> instances = new HashMap<>(); for (var sentence : dld.sentences) { - for (WordSpan kw : keywordExtractor.getNames(sentence)) { + for (WordSpan kw : keywordExtractor.getNouns(sentence)) { if (kw.end + 2 >= sentence.length()) { continue; } @@ -46,20 +48,46 @@ public class SubjectCounter { String stemmed = rep.stemmed; - counts.merge(stemmed, -1, Integer::sum); instances.computeIfAbsent(stemmed, s -> 
new HashSet<>()).add(rep); } } } - int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0); + Map scores = new HashMap<>(instances.size()); + for (String stemmed : instances.keySet()) { + scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed)); + } - return counts.entrySet().stream().sorted(Map.Entry.comparingByValue()) - .filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75) + return scores.entrySet().stream() + .filter(e -> e.getValue() >= 150) .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream()) .collect(Collectors.toList()); } + private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) { + if (stemmed.contains("_")) { + int sum = 0; + String[] parts = StringUtils.split(stemmed, '_'); + + if (parts.length == 0) { + return 0; + } + + for (String part : parts) { + sum += getTermTfIdf(keywordMetadata, part); + } + + return sum / parts.length; + } + + var meta = keywordMetadata.wordsTfIdf().get(stemmed); + if (meta != null) { + return meta.tfIdfNormalized(); + } + + return 0; + } + private boolean isDetOrAdverbOrVerb(String posTag) { return "DT".equals(posTag) // determinant || "RB".equals(posTag) // adverb