mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
String deduplication
This commit is contained in:
parent
0fd21b9cbf
commit
4d0b444703
@ -1,20 +1,33 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class StringPool {
|
||||
private final HashMap<String, String> words;
|
||||
|
||||
public StringPool() {
|
||||
this.words = new HashMap<>(1000);
|
||||
private final HashMap<String, String> words;
|
||||
private final Object2LongOpenHashMap<String> ages;
|
||||
private final int maxCap;
|
||||
|
||||
long idx;
|
||||
|
||||
/**
 * Builds a pool backed by two maps of the same initial capacity.
 *
 * @param capacity initial capacity handed to both the word map and the age map
 * @param maxCap   number of pooled words at or above which {@code prune()} starts evicting
 */
private StringPool(int capacity, int maxCap) {
    this.maxCap = maxCap;

    // The age map is created before the word map, so an invalid capacity
    // is reported by the same constructor as before.
    this.ages = new Object2LongOpenHashMap<>(capacity);
    this.words = new HashMap<>(capacity);
}
|
||||
|
||||
public StringPool(int capacity) {
|
||||
words = new HashMap<>(capacity);
|
||||
public static StringPool create(int capacity) {
|
||||
return new StringPool(capacity, capacity * 10);
|
||||
}
|
||||
|
||||
public String internalize(String str) {
|
||||
prune();
|
||||
|
||||
final String ret = words.putIfAbsent(str, str);
|
||||
ages.put(ret, idx++);
|
||||
|
||||
if (null == ret)
|
||||
return str;
|
||||
@ -22,6 +35,37 @@ public class StringPool {
|
||||
return ret;
|
||||
}
|
||||
|
||||
public String[] internalize(String[] str) {
|
||||
|
||||
for (int i = 0; i < str.length; i++) {
|
||||
str[i] = internalize(str[i]);
|
||||
}
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
public void prune() {
|
||||
|
||||
if (words.size() < maxCap)
|
||||
return;
|
||||
|
||||
long[] ageValues = ages.values().toLongArray();
|
||||
Arrays.sort(ageValues);
|
||||
|
||||
long cutoff = ageValues[ageValues.length - maxCap / 10];
|
||||
|
||||
words.clear();
|
||||
ages.forEach((word, cnt) -> {
|
||||
if (cnt >= cutoff) {
|
||||
words.put(word, word);
|
||||
}
|
||||
});
|
||||
ages.clear();
|
||||
words.forEach((w,w2) -> {
|
||||
ages.put(w, idx);
|
||||
});
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
words.clear();
|
||||
}
|
||||
|
@ -42,7 +42,7 @@ public class DomainProcessor {
|
||||
|
||||
fixBadCanonicalTags(crawledDomain.doc);
|
||||
|
||||
StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size());
|
||||
StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
|
||||
|
||||
for (var doc : crawledDomain.doc) {
|
||||
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
||||
|
Loading…
Reference in New Issue
Block a user