mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Clean up DocumentKeywordExtractor and DocumentKeywordsBuilder
This commit is contained in:
parent
04f501b8c8
commit
bd84c73e05
@ -12,7 +12,14 @@ import java.util.function.UnaryOperator;
|
||||
public class DocumentKeywordsBuilder {
|
||||
public final ArrayList<String> words = new ArrayList<>();
|
||||
public final TLongArrayList metadata = new TLongArrayList();
|
||||
|
||||
// |------64 letters is this long-------------------------------|
|
||||
// granted, some of these words are word n-grams, but 64 ought to
|
||||
// be plenty. The lexicon writer has another limit that's higher.
|
||||
private final int MAX_WORD_LENGTH = 64;
|
||||
|
||||
public DocumentKeywordsBuilder() {
|
||||
this(1600);
|
||||
}
|
||||
|
||||
public DocumentKeywords build() {
|
||||
@ -24,47 +31,22 @@ public class DocumentKeywordsBuilder {
|
||||
metadata.ensureCapacity(cacpacity);
|
||||
}
|
||||
|
||||
public DocumentKeywordsBuilder(Collection<Entry> initial) {
|
||||
public void add(String word, long meta) {
|
||||
if (word.length() > MAX_WORD_LENGTH)
|
||||
return;
|
||||
|
||||
words.ensureCapacity(initial.size());
|
||||
metadata.ensureCapacity(initial.size());
|
||||
for (var entry : initial) {
|
||||
words.add(entry.word);
|
||||
metadata.add(entry.metadata);
|
||||
}
|
||||
}
|
||||
|
||||
public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
|
||||
List<Long> emptyMeta = new ArrayList<>(entries.size());
|
||||
|
||||
for (int i = 0; i < entries.size(); i++) {
|
||||
emptyMeta.add(0L);
|
||||
}
|
||||
|
||||
return new DocumentKeywordsBuilder(entries, emptyMeta);
|
||||
words.add(word);
|
||||
metadata.add(meta);
|
||||
}
|
||||
|
||||
public void addJustNoMeta(String word) {
|
||||
if (word.length() > MAX_WORD_LENGTH)
|
||||
return;
|
||||
|
||||
words.add(word);
|
||||
metadata.add(0);
|
||||
}
|
||||
|
||||
private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
|
||||
|
||||
this.words.addAll(words);
|
||||
this.metadata.addAll(meta);
|
||||
}
|
||||
|
||||
public void addAll(Collection<Entry> newWords) {
|
||||
words.ensureCapacity(words.size() + newWords.size());
|
||||
metadata.ensureCapacity(metadata.size() + newWords.size());
|
||||
|
||||
for (var entry : newWords) {
|
||||
words.add(entry.word);
|
||||
metadata.add(entry.metadata);
|
||||
}
|
||||
}
|
||||
|
||||
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
|
||||
if (flagWords.isEmpty())
|
||||
return;
|
||||
@ -100,11 +82,6 @@ public class DocumentKeywordsBuilder {
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void add(String word, long meta) {
|
||||
words.add(word);
|
||||
metadata.add(meta);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return words.size();
|
||||
}
|
||||
@ -113,6 +90,4 @@ public class DocumentKeywordsBuilder {
|
||||
words.replaceAll(internalizer);
|
||||
}
|
||||
|
||||
public record Entry(String word, long metadata) {
|
||||
}
|
||||
}
|
||||
|
@ -224,7 +224,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
private static class FilteringDocumentKeywordsBuilder {
|
||||
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600);
|
||||
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
|
||||
private final Set<String> seen = new HashSet<>(1600);
|
||||
|
||||
public void add(String word, long meta) {
|
||||
|
Loading…
Reference in New Issue
Block a user