Clean up KeywordMetadata

This commit is contained in:
Viktor Lofgren 2023-01-30 10:22:43 +01:00
parent d5df3268b3
commit 2e4532ca90
2 changed files with 5 additions and 6 deletions

View File

@@ -154,7 +154,7 @@ public class DocumentKeywordExtractor {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
}
}
}
@@ -163,7 +163,7 @@ public class DocumentKeywordExtractor {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
}
}
@@ -217,7 +217,7 @@ public class DocumentKeywordExtractor {
continue;
}
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
}
}

View File

@@ -17,8 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
)
{
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
new HashMap<>(15_000),
@@ -30,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
this(EnumSet.noneOf(EdgePageWordFlags.class));
}
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();