mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Clean up KeywordMetadata
This commit is contained in:
parent
d5df3268b3
commit
2e4532ca90
@ -154,7 +154,7 @@ public class DocumentKeywordExtractor {
|
|||||||
if (!word.isStopWord()) {
|
if (!word.isStopWord()) {
|
||||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
|
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -163,7 +163,7 @@ public class DocumentKeywordExtractor {
|
|||||||
var rep = new WordRep(sent, names);
|
var rep = new WordRep(sent, names);
|
||||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||||
|
|
||||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
|
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -217,7 +217,7 @@ public class DocumentKeywordExtractor {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
|
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,8 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
|
||||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
|
||||||
|
|
||||||
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
||||||
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
||||||
new HashMap<>(15_000),
|
new HashMap<>(15_000),
|
||||||
@ -30,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
|
|||||||
this(EnumSet.noneOf(EdgePageWordFlags.class));
|
this(EnumSet.noneOf(EdgePageWordFlags.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||||
|
public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
||||||
|
|
||||||
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
|
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
|
||||||
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
||||||
|
Loading…
Reference in New Issue
Block a user