From 443cf0cf1e70b73861bf9ff346eccb769ff1ff49 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 12 Jun 2023 17:42:31 +0200 Subject: [PATCH] Expose additional functionality through WordsTfIdfCounts. Bump requirements for being flagged as high TF-IDF from 2 occurrences to 3. --- .../keyword/extractors/WordsTfIdfCounts.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java index 7b8be9d2..e017061e 100644 --- a/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java +++ b/code/features-convert/keyword-extraction/src/main/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java @@ -2,7 +2,6 @@ package nu.marginalia.keyword.extractors; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import nu.marginalia.keyword.WordReps; -import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; @@ -18,7 +17,7 @@ import static java.lang.Math.max; /** Extract counts and TF-IDF for the words in the document, * keep track of high-scoring words for flagging */ -public class WordsTfIdfCounts implements WordReps { +public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> { private final TermFrequencyDict dict; private final double docCount; @@ -41,7 +40,7 @@ public class WordsTfIdfCounts implements WordReps { int value = getTermValue(key, cnt, maxVal); tfIdf.put(key, value); - if (cnt > 1 && value > 100) { + if (cnt > 2 && value > 100) { highTfIdfInstances.add(key); } }); @@ -74,6 +73,10 @@ public class WordsTfIdfCounts implements WordReps { return counts; } + public long termFrequencyDictValue(WordRep rep) { + return 
dict.getTermFreqStemmed(rep.stemmed); + } + private String spanToStemmed(DocumentSentence sentence, WordSpan span) { if (span.size() == 1) return sentence.stemmedWords[span.start]; @@ -133,4 +136,8 @@ public class WordsTfIdfCounts implements WordReps { return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount); } + /** Orders by the value tfIdf holds for each argument (0 when absent), ascending. */ @Override + public int compare(WordRep o1, WordRep o2) { + return Integer.compare(tfIdf.getOrDefault(o1, 0), tfIdf.getOrDefault(o2, 0)); /* Integer.compare cannot overflow, unlike subtracting the two values. NOTE(review): the lookup key here is the WordRep itself, whereas termFrequencyDictValue keys on rep.stemmed; confirm tfIdf's key type. */ + } }