2023-01-08 10:11:44 +00:00
|
|
|
package nu.marginalia.util;
|
|
|
|
|
2023-01-30 08:40:29 +00:00
|
|
|
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
|
|
|
|
|
|
|
import java.util.Arrays;
|
2023-01-08 10:11:44 +00:00
|
|
|
import java.util.HashMap;
|
2023-06-12 15:42:28 +00:00
|
|
|
import java.util.Objects;
|
2023-01-08 10:11:44 +00:00
|
|
|
|
|
|
|
public class StringPool {
|
2023-01-30 08:40:29 +00:00
|
|
|
|
2023-01-08 10:11:44 +00:00
|
|
|
private final HashMap<String, String> words;
|
2023-01-30 08:40:29 +00:00
|
|
|
private final Object2LongOpenHashMap<String> ages;
|
|
|
|
private final int maxCap;
|
2023-01-08 10:11:44 +00:00
|
|
|
|
2023-01-30 08:40:29 +00:00
|
|
|
long idx;
|
|
|
|
|
|
|
|
private StringPool(int capacity, int maxCap) {
|
|
|
|
this.ages = new Object2LongOpenHashMap<>(capacity);
|
|
|
|
this.words = new HashMap<>(capacity);
|
|
|
|
this.maxCap = maxCap;
|
2023-01-08 10:11:44 +00:00
|
|
|
}
|
|
|
|
|
2023-01-30 08:40:29 +00:00
|
|
|
public static StringPool create(int capacity) {
|
|
|
|
return new StringPool(capacity, capacity * 10);
|
2023-01-08 10:11:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
public String internalize(String str) {
|
2023-01-30 08:40:29 +00:00
|
|
|
prune();
|
|
|
|
|
2023-01-08 10:11:44 +00:00
|
|
|
final String ret = words.putIfAbsent(str, str);
|
2023-01-30 08:40:29 +00:00
|
|
|
ages.put(ret, idx++);
|
2023-01-08 10:11:44 +00:00
|
|
|
|
2023-06-12 15:42:28 +00:00
|
|
|
return Objects.requireNonNullElse(ret, str);
|
2023-01-08 10:11:44 +00:00
|
|
|
}
|
|
|
|
|
2023-01-30 08:40:29 +00:00
|
|
|
public String[] internalize(String[] str) {
|
|
|
|
|
|
|
|
for (int i = 0; i < str.length; i++) {
|
|
|
|
str[i] = internalize(str[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void prune() {
|
|
|
|
|
|
|
|
if (words.size() < maxCap)
|
|
|
|
return;
|
|
|
|
|
|
|
|
long[] ageValues = ages.values().toLongArray();
|
|
|
|
Arrays.sort(ageValues);
|
|
|
|
|
|
|
|
long cutoff = ageValues[ageValues.length - maxCap / 10];
|
|
|
|
|
|
|
|
words.clear();
|
|
|
|
ages.forEach((word, cnt) -> {
|
|
|
|
if (cnt >= cutoff) {
|
|
|
|
words.put(word, word);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ages.clear();
|
|
|
|
words.forEach((w,w2) -> {
|
|
|
|
ages.put(w, idx);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-01-08 10:11:44 +00:00
|
|
|
public void flush() {
|
|
|
|
words.clear();
|
|
|
|
}
|
|
|
|
}
|