Clean up DocumentKeywordExtractor and DocumentKeywordsBuilder

This commit is contained in:
Viktor Lofgren 2023-03-07 16:36:12 +01:00
parent 04f501b8c8
commit bd84c73e05
2 changed files with 16 additions and 41 deletions

View File

@ -12,7 +12,14 @@ import java.util.function.UnaryOperator;
public class DocumentKeywordsBuilder {
public final ArrayList<String> words = new ArrayList<>();
public final TLongArrayList metadata = new TLongArrayList();
// |------64 letters is this long-------------------------------|
// granted, some of these words are word n-grams, but 64 ought to
// be plenty. The lexicon writer has another limit that's higher.
// Declared static: the limit is a class-wide constant, so it does not
// need to be stored as a separate field in every builder instance.
private static final int MAX_WORD_LENGTH = 64;
/**
 * Creates a builder by delegating to the single-int constructor with the value 1600.
 * NOTE(review): that argument is presumably the initial capacity of the word and
 * metadata lists -- confirm against the int constructor.
 */
public DocumentKeywordsBuilder() {
this(1600);
}
public DocumentKeywords build() {
@ -24,47 +31,22 @@ public class DocumentKeywordsBuilder {
metadata.ensureCapacity(cacpacity);
}
public DocumentKeywordsBuilder(Collection<Entry> initial) {
public void add(String word, long meta) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.ensureCapacity(initial.size());
metadata.ensureCapacity(initial.size());
for (var entry : initial) {
words.add(entry.word);
metadata.add(entry.metadata);
}
}
public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
List<Long> emptyMeta = new ArrayList<>(entries.size());
for (int i = 0; i < entries.size(); i++) {
emptyMeta.add(0L);
}
return new DocumentKeywordsBuilder(entries, emptyMeta);
words.add(word);
metadata.add(meta);
}
/**
 * Appends {@code word} with a metadata value of zero.
 * A word whose {@code length()} exceeds {@code MAX_WORD_LENGTH} is silently skipped.
 *
 * @param word the keyword to append
 */
public void addJustNoMeta(String word) {
    final boolean fitsLimit = word.length() <= MAX_WORD_LENGTH;
    if (fitsLimit) {
        words.add(word);
        metadata.add(0);
    }
}
/**
 * Creates a builder and appends every element of the given lists to it.
 *
 * @param words keywords appended to this builder's word list
 * @param meta  metadata values appended to this builder's metadata list;
 *              NOTE(review): presumably one value per entry of {@code words},
 *              in the same order -- this is not checked here
 */
private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
this.words.addAll(words);
this.metadata.addAll(meta);
}
/**
 * Appends every entry of {@code newWords}: each entry's word goes to the word list
 * and its metadata to the metadata list. Both lists are grown up front to fit the
 * incoming entries. Unlike {@code addJustNoMeta}, no word-length check is applied.
 *
 * @param newWords the entries to append
 */
public void addAll(Collection<Entry> newWords) {
    final int incoming = newWords.size();
    words.ensureCapacity(words.size() + incoming);
    metadata.ensureCapacity(metadata.size() + incoming);

    newWords.forEach(entry -> {
        words.add(entry.word);
        metadata.add(entry.metadata);
    });
}
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
if (flagWords.isEmpty())
return;
@ -100,11 +82,6 @@ public class DocumentKeywordsBuilder {
return ret;
}
/**
 * Appends {@code word} together with its metadata value.
 * A word whose {@code length()} exceeds {@code MAX_WORD_LENGTH} is silently
 * skipped, which is the same limit {@code addJustNoMeta} applies.
 *
 * @param word the keyword to append
 * @param meta the metadata value stored alongside the keyword
 */
public void add(String word, long meta) {
    // Enforce the same length limit as addJustNoMeta so over-long words
    // cannot get in through this entry point.
    if (word.length() > MAX_WORD_LENGTH) {
        return;
    }

    words.add(word);
    metadata.add(meta);
}
/** Returns the number of words currently held, as reported by {@code words.size()}. */
public int size() {
return words.size();
}
@ -113,6 +90,4 @@ public class DocumentKeywordsBuilder {
words.replaceAll(internalizer);
}
/**
 * A keyword paired with its metadata value.
 *
 * @param word     the keyword
 * @param metadata the metadata value associated with the keyword
 */
public record Entry(String word, long metadata) {}
}

View File

@ -224,7 +224,7 @@ public class DocumentKeywordExtractor {
}
private static class FilteringDocumentKeywordsBuilder {
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600);
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
private final Set<String> seen = new HashSet<>(1600);
public void add(String word, long meta) {