Clean up DocumentKeywordExtractor and DocumentKeywordsBuilder

This commit is contained in:
Viktor Lofgren 2023-03-07 16:36:12 +01:00
parent 04f501b8c8
commit bd84c73e05
2 changed files with 16 additions and 41 deletions

View File

@ -12,7 +12,14 @@ import java.util.function.UnaryOperator;
public class DocumentKeywordsBuilder { public class DocumentKeywordsBuilder {
public final ArrayList<String> words = new ArrayList<>(); public final ArrayList<String> words = new ArrayList<>();
public final TLongArrayList metadata = new TLongArrayList(); public final TLongArrayList metadata = new TLongArrayList();
// |------64 letters is this long-------------------------------|
// granted, some of these words are word n-grams, but 64 ought to
// be plenty. The lexicon writer has another limit that's higher.
private final int MAX_WORD_LENGTH = 64;
public DocumentKeywordsBuilder() { public DocumentKeywordsBuilder() {
this(1600);
} }
public DocumentKeywords build() { public DocumentKeywords build() {
@ -24,47 +31,22 @@ public class DocumentKeywordsBuilder {
metadata.ensureCapacity(cacpacity); metadata.ensureCapacity(cacpacity);
} }
public DocumentKeywordsBuilder(Collection<Entry> initial) { public void add(String word, long meta) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.ensureCapacity(initial.size()); words.add(word);
metadata.ensureCapacity(initial.size()); metadata.add(meta);
for (var entry : initial) {
words.add(entry.word);
metadata.add(entry.metadata);
}
}
public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
List<Long> emptyMeta = new ArrayList<>(entries.size());
for (int i = 0; i < entries.size(); i++) {
emptyMeta.add(0L);
}
return new DocumentKeywordsBuilder(entries, emptyMeta);
} }
public void addJustNoMeta(String word) { public void addJustNoMeta(String word) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.add(word); words.add(word);
metadata.add(0); metadata.add(0);
} }
private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
this.words.addAll(words);
this.metadata.addAll(meta);
}
public void addAll(Collection<Entry> newWords) {
words.ensureCapacity(words.size() + newWords.size());
metadata.ensureCapacity(metadata.size() + newWords.size());
for (var entry : newWords) {
words.add(entry.word);
metadata.add(entry.metadata);
}
}
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) { public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
if (flagWords.isEmpty()) if (flagWords.isEmpty())
return; return;
@ -100,11 +82,6 @@ public class DocumentKeywordsBuilder {
return ret; return ret;
} }
public void add(String word, long meta) {
words.add(word);
metadata.add(meta);
}
public int size() { public int size() {
return words.size(); return words.size();
} }
@ -113,6 +90,4 @@ public class DocumentKeywordsBuilder {
words.replaceAll(internalizer); words.replaceAll(internalizer);
} }
public record Entry(String word, long metadata) {
}
} }

View File

@ -224,7 +224,7 @@ public class DocumentKeywordExtractor {
} }
private static class FilteringDocumentKeywordsBuilder { private static class FilteringDocumentKeywordsBuilder {
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600); private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
private final Set<String> seen = new HashSet<>(1600); private final Set<String> seen = new HashSet<>(1600);
public void add(String word, long meta) { public void add(String word, long meta) {