Clean up KeywordMetadata

This commit is contained in:
Viktor Lofgren 2023-01-30 10:22:43 +01:00
parent d5df3268b3
commit 2e4532ca90
2 changed files with 5 additions and 6 deletions

View File

@ -154,7 +154,7 @@ public class DocumentKeywordExtractor {
if (!word.isStopWord()) { if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) { if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed())); wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
} }
} }
} }
@ -163,7 +163,7 @@ public class DocumentKeywordExtractor {
var rep = new WordRep(sent, names); var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word); String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed)); wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
} }
} }
@ -217,7 +217,7 @@ public class DocumentKeywordExtractor {
continue; continue;
} }
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta); wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
} }
} }

View File

@ -17,8 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
) )
{ {
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) { public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50), this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
new HashMap<>(15_000), new HashMap<>(15_000),
@ -30,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
this(EnumSet.noneOf(EdgePageWordFlags.class)); this(EnumSet.noneOf(EdgePageWordFlags.class));
} }
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) { private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty); KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone(); EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();