diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java
--- a/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/StringPool.java
@@ -1,20 +1,64 @@
 package nu.marginalia.util;
 
+import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
+
+import java.util.Arrays;
 import java.util.HashMap;
 
+/**
+ * Size-bounded pool of canonical {@code String} instances.
+ *
+ * Every {@link #internalize(String)} call stamps the pooled string with the
+ * current value of an increasing counter. Once the pool has grown to
+ * {@code maxCap} entries, {@link #prune()} evicts all but the most recently
+ * stamped ones.
+ */
 public class StringPool {
-    private final HashMap<String, String> words;
 
-    public StringPool() {
-        this.words = new HashMap<>(1000);
+    /** Canonical instances, each mapped to itself. */
+    private final HashMap<String, String> words;
+    /** Counter value at each pooled string's latest use. */
+    private final Object2LongOpenHashMap<String> ages;
+    /** Pool size at which {@link #prune()} starts evicting. */
+    private final int maxCap;
+
+    /** Use counter, incremented by every single-string internalize call. */
+    long idx;
+
+    private StringPool(int capacity, int maxCap) {
+        this.ages = new Object2LongOpenHashMap<>(capacity);
+        this.words = new HashMap<>(capacity);
+        this.maxCap = maxCap;
     }
 
-    public StringPool(int capacity) {
-        words = new HashMap<>(capacity);
+    /**
+     * Creates an empty pool whose maps are presized for {@code capacity}
+     * entries and which is pruned once it holds ten times that many.
+     *
+     * @param capacity initial capacity of the backing maps
+     * @return a new, empty pool
+     */
+    public static StringPool create(int capacity) {
+        // Widen to long so capacity * 10 cannot overflow to a negative cap.
+        int maxCap = (int) Math.min(Integer.MAX_VALUE, capacity * 10L);
+        return new StringPool(capacity, maxCap);
     }
 
+    /**
+     * Returns the pooled instance equal to {@code str}, adding {@code str} to
+     * the pool first if no equal string is present, and stamps that instance
+     * as the most recently used.
+     *
+     * @param str the string to canonicalize
+     * @return the pooled instance equal to {@code str}
+     */
     public String internalize(String str) {
+        prune();
+
         final String ret = words.putIfAbsent(str, str);
+        // putIfAbsent returns null when str was newly added; stamp str itself
+        // in that case instead of recording the age under a null key.
+        ages.put(null == ret ? str : ret, idx++);
 
         if (null == ret)
             return str;
@@ -22,6 +66,56 @@ public class StringPool {
         return ret;
     }
 
+    /**
+     * Replaces every element of {@code str} with its pooled instance.
+     *
+     * @param str array to canonicalize; modified in place
+     * @return the same array, {@code str}
+     */
+    public String[] internalize(String[] str) {
+
+        for (int i = 0; i < str.length; i++) {
+            str[i] = internalize(str[i]);
+        }
+
+        return str;
+    }
+
+    /**
+     * Evicts the least recently used strings once the pool holds at least
+     * {@code maxCap} of them, keeping about {@code maxCap / 10}. Survivors
+     * all restart with the same age. Does nothing below the cap.
+     */
+    public void prune() {
+
+        if (words.size() < maxCap)
+            return;
+
+        long[] ageValues = ages.values().toLongArray();
+        if (ageValues.length == 0)
+            return;
+
+        Arrays.sort(ageValues);
+
+        // Keep at least one entry and never index before the start of the array
+        // (maxCap / 10 is 0 for small caps and may exceed the tracked count).
+        int keep = Math.min(ageValues.length, Math.max(1, maxCap / 10));
+        long cutoff = ageValues[ageValues.length - keep];
+
+        words.clear();
+        ages.forEach((word, cnt) -> {
+            if (cnt >= cutoff) {
+                words.put(word, word);
+            }
+        });
+        ages.clear();
+        words.forEach((w,w2) -> {
+            ages.put(w, idx);
+        });
+    }
+
+    /** Empties the pool, including the recorded ages. */
     public void flush() {
         words.clear();
+        ages.clear();
     }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
index d9ff7ef1..e5ed00e5 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DomainProcessor.java
@@ -42,7 +42,7 @@ public class DomainProcessor {
 
             fixBadCanonicalTags(crawledDomain.doc);
 
-            StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size());
+            StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
 
             for (var doc : crawledDomain.doc) {
                 var processedDoc = documentProcessor.process(doc, crawledDomain);