mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Clean up DocumentKeywordExtractor and DocumentKeywordsBuilder
This commit is contained in:
parent
04f501b8c8
commit
bd84c73e05
@ -12,7 +12,14 @@ import java.util.function.UnaryOperator;
|
|||||||
public class DocumentKeywordsBuilder {
|
public class DocumentKeywordsBuilder {
|
||||||
public final ArrayList<String> words = new ArrayList<>();
|
public final ArrayList<String> words = new ArrayList<>();
|
||||||
public final TLongArrayList metadata = new TLongArrayList();
|
public final TLongArrayList metadata = new TLongArrayList();
|
||||||
|
|
||||||
|
// |------64 letters is this long-------------------------------|
|
||||||
|
// granted, some of these words are word n-grams, but 64 ought to
|
||||||
|
// be plenty. The lexicon writer has another limit that's higher.
|
||||||
|
private final int MAX_WORD_LENGTH = 64;
|
||||||
|
|
||||||
public DocumentKeywordsBuilder() {
|
public DocumentKeywordsBuilder() {
|
||||||
|
this(1600);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DocumentKeywords build() {
|
public DocumentKeywords build() {
|
||||||
@ -24,47 +31,22 @@ public class DocumentKeywordsBuilder {
|
|||||||
metadata.ensureCapacity(cacpacity);
|
metadata.ensureCapacity(cacpacity);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DocumentKeywordsBuilder(Collection<Entry> initial) {
|
public void add(String word, long meta) {
|
||||||
|
if (word.length() > MAX_WORD_LENGTH)
|
||||||
|
return;
|
||||||
|
|
||||||
words.ensureCapacity(initial.size());
|
words.add(word);
|
||||||
metadata.ensureCapacity(initial.size());
|
metadata.add(meta);
|
||||||
for (var entry : initial) {
|
|
||||||
words.add(entry.word);
|
|
||||||
metadata.add(entry.metadata);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static DocumentKeywordsBuilder withBlankMetadata(List<String> entries) {
|
|
||||||
List<Long> emptyMeta = new ArrayList<>(entries.size());
|
|
||||||
|
|
||||||
for (int i = 0; i < entries.size(); i++) {
|
|
||||||
emptyMeta.add(0L);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new DocumentKeywordsBuilder(entries, emptyMeta);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addJustNoMeta(String word) {
|
public void addJustNoMeta(String word) {
|
||||||
|
if (word.length() > MAX_WORD_LENGTH)
|
||||||
|
return;
|
||||||
|
|
||||||
words.add(word);
|
words.add(word);
|
||||||
metadata.add(0);
|
metadata.add(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
private DocumentKeywordsBuilder(List<String> words, List<Long> meta) {
|
|
||||||
|
|
||||||
this.words.addAll(words);
|
|
||||||
this.metadata.addAll(meta);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addAll(Collection<Entry> newWords) {
|
|
||||||
words.ensureCapacity(words.size() + newWords.size());
|
|
||||||
metadata.ensureCapacity(metadata.size() + newWords.size());
|
|
||||||
|
|
||||||
for (var entry : newWords) {
|
|
||||||
words.add(entry.word);
|
|
||||||
metadata.add(entry.metadata);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
|
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
|
||||||
if (flagWords.isEmpty())
|
if (flagWords.isEmpty())
|
||||||
return;
|
return;
|
||||||
@ -100,11 +82,6 @@ public class DocumentKeywordsBuilder {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(String word, long meta) {
|
|
||||||
words.add(word);
|
|
||||||
metadata.add(meta);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
public int size() {
|
||||||
return words.size();
|
return words.size();
|
||||||
}
|
}
|
||||||
@ -113,6 +90,4 @@ public class DocumentKeywordsBuilder {
|
|||||||
words.replaceAll(internalizer);
|
words.replaceAll(internalizer);
|
||||||
}
|
}
|
||||||
|
|
||||||
public record Entry(String word, long metadata) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -224,7 +224,7 @@ public class DocumentKeywordExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static class FilteringDocumentKeywordsBuilder {
|
private static class FilteringDocumentKeywordsBuilder {
|
||||||
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder(1600);
|
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
|
||||||
private final Set<String> seen = new HashSet<>(1600);
|
private final Set<String> seen = new HashSet<>(1600);
|
||||||
|
|
||||||
public void add(String word, long meta) {
|
public void add(String word, long meta) {
|
||||||
|
Loading…
Reference in New Issue
Block a user