Improve subject extraction and remove an unnecessary calculation from DocumentKeywordExtractor

This commit is contained in:
Viktor Lofgren 2023-01-30 09:41:54 +01:00
parent 4d0b444703
commit 8349435ef4
4 changed files with 115 additions and 50 deletions

View File

@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {
getWordPositions(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
@ -160,7 +159,7 @@ public class DocumentKeywordExtractor {
}
}
for (var names : keywordExtractor.getNames(sent)) {
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);

View File

@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
public class KeywordExtractor {
public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
public WordSpan[] getProperNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isProperNoun(i, sentence))
@ -57,27 +55,73 @@ public class KeywordExtractor {
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(sentence.length());
Set<String> topWords = Collections.emptySet();
public WordSpan[] getNouns(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
if (isNoun(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
if (isNoun(i, sentence)
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if ((isNoun(i, sentence))
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
&& (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
spans.add(new WordSpan(i-2, i+1));
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
spans.add(new WordSpan(i-3, i+1));
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
spans.add(new WordSpan(i-3, i+1));
else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
&& (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
spans.add(new WordSpan(i-3, i+1));
}
}
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence) || isTopAdj(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence)) {
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
spans.add(new WordSpan(i - 1, i + 1));
}
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
@ -86,16 +130,16 @@ public class KeywordExtractor {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
if (isName(i, sentence)) {
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
&& (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
spans.add(new WordSpan(i - 2, i + 1));
}
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
@ -105,10 +149,10 @@ public class KeywordExtractor {
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords) &&
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
if (isName(i, sentence) &&
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
(isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
(isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
spans.add(new WordSpan(i - 3, i + 1));
}
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
@ -134,7 +178,9 @@ public class KeywordExtractor {
/** True when token {@code i} carries a proper-noun POS tag (NNP or NNPS). */
public boolean isProperNoun(int i, DocumentSentence sent) {
    String tag = sent.posTags[i];
    return "NNP".equals(tag) || "NNPS".equals(tag);
}
/** True when token {@code i} carries any noun POS tag (NN, NNS, NNP, NNPS). */
public boolean isNoun(int i, DocumentSentence sent) {
    String tag = sent.posTags[i];
    return tag.startsWith("NN");
}
public boolean isJoiner(DocumentSentence sent, int i) {
if(sent.posTags[i].equals("IN")) {
return true;
@ -183,21 +229,13 @@ public class KeywordExtractor {
return true;
}
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
if (!topWords.isEmpty()) {
String posTag = sentence.posTags[i];
String word = sentence.stemmedWords[i];
return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
}
private boolean isName(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
}
private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
private boolean isTopAdj(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));

View File

@ -20,7 +20,7 @@ public class NameCounter {
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;

View File

@ -1,9 +1,11 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ -23,13 +25,13 @@ public class SubjectCounter {
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public List<WordRep> count(DocumentLanguageData dld) {
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Map<String, Integer> counts = new HashMap<>();
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
@ -46,20 +48,46 @@ public class SubjectCounter {
String stemmed = rep.stemmed;
counts.merge(stemmed, -1, Integer::sum);
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
}
}
}
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
Map<String, Integer> scores = new HashMap<>(instances.size());
for (String stemmed : instances.keySet()) {
scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
}
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
return scores.entrySet().stream()
.filter(e -> e.getValue() >= 150)
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
.collect(Collectors.toList());
}
/**
 * Looks up the normalized TF-IDF score of a stemmed term.
 * <p>
 * Compound terms (parts joined with '_') are scored as the integer mean of
 * their parts' scores, computed recursively. Terms absent from
 * {@code keywordMetadata.wordsTfIdf()} score 0.
 */
private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
    // Simple term: direct lookup in the per-document TF-IDF table.
    if (!stemmed.contains("_")) {
        var meta = keywordMetadata.wordsTfIdf().get(stemmed);
        return meta == null ? 0 : meta.tfIdfNormalized();
    }

    // Compound term: average the scores of its underscore-separated parts.
    String[] parts = StringUtils.split(stemmed, '_');
    if (parts.length == 0) {
        return 0; // degenerate input such as "_" — avoid division by zero
    }

    int total = 0;
    for (String part : parts) {
        total += getTermTfIdf(keywordMetadata, part);
    }
    return total / parts.length;
}
private boolean isDetOrAdverbOrVerb(String posTag) {
return "DT".equals(posTag) // determinant
|| "RB".equals(posTag) // adverb