diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 9e89b730..492417a0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -1,8 +1,5 @@ package nu.marginalia.util.dict; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.nio.ByteBuffer; import java.nio.LongBuffer; import java.util.ArrayList; @@ -10,7 +7,6 @@ import java.util.ArrayList; public class DictionaryData { private final int DICTIONARY_BANK_SIZE; - private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class); private final ArrayList banks = new ArrayList<>(100); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java index fad45130..fb13893e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryMap.java @@ -1,6 +1,17 @@ package nu.marginalia.util.dict; public interface DictionaryMap { + int NO_VALUE = Integer.MIN_VALUE; + + static DictionaryMap create() { + if (Boolean.getBoolean("small-ram")) { + return new OnHeapDictionaryMap(); + } + else { + return new OffHeapDictionaryHashMap(1L << 31); + } + } + int size(); int put(long key); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java similarity index 96% rename from marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java rename to marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java index f66599d3..f906c45a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryHashMap.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OffHeapDictionaryHashMap.java @@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize; * Spiritually influenced by GNU Trove's hash maps * LGPL 2.1 */ -public class DictionaryHashMap implements DictionaryMap { - private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class); +public class OffHeapDictionaryHashMap implements DictionaryMap { + private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class); private static final Gauge probe_count_metrics = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count") .register(); private final int bufferCount; private final IntBuffer[] buffers; - public static final int NO_VALUE = Integer.MIN_VALUE; private final DictionaryData dictionaryData; @@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap { private final AtomicInteger sz = new AtomicInteger(0); - public DictionaryHashMap(long sizeMemory) { + public OffHeapDictionaryHashMap(long sizeMemory) { final int intSize = 4; bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java new file mode 100644 index 00000000..a9f4063f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/OnHeapDictionaryMap.java @@ -0,0 +1,36 @@ +package nu.marginalia.util.dict; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; + +/** + * Heap-backed {@link DictionaryMap} that keeps key-to-id assignments in a fastutil + * {@code Long2IntOpenHashMap}; a new key is assigned the map size at insertion time as its id. + */ +public class OnHeapDictionaryMap implements DictionaryMap { + private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f); + + public OnHeapDictionaryMap() { + // Absent keys report NO_VALUE, so put() can tell "new" from "existing" in one lookup. + entries.defaultReturnValue(NO_VALUE); + } + + /** Returns the number of keys stored. */ + @Override + public int size() { + return entries.size(); + } + + /** Returns the id of {@code key}, assigning the next id (the current size) if the key is new. */ + @Override + public int put(long key) { + final int nextId = entries.size(); + final int existingId = entries.putIfAbsent(key, nextId); + return existingId == NO_VALUE ? nextId : existingId; + } + + /** Returns the id of {@code key}, or {@code NO_VALUE} if the key has not been inserted. */ + @Override + public int get(long key) { + return entries.getOrDefault(key, NO_VALUE); + } +} diff 
--git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java index 986f1874..361a7d47 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java @@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule { public void configure() { - if (Boolean.getBoolean("small-ram")) { - bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27); - } - else { - bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); - } - } @Provides diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index d069ec05..e81a1682 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -5,7 +5,7 @@ import com.google.inject.Singleton; import com.google.inject.name.Named; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; @@ -48,7 +48,6 @@ public class IndexServicesFactory { private final PartitionedDataFile revPrioIndexWords; private volatile static KeywordLexicon keywordLexicon; - private final Long dictionaryHashMapSize; private final Path searchSetsBase; @@ -60,12 +59,10 @@ public class IndexServicesFactory { @Named("tmp-file-dir") Path tmpFileDir, @Named("partition-root-slow") Path partitionRootSlow, 
@Named("partition-root-fast") Path partitionRootFast, - @Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize, EdgeDomainBlacklist domainBlacklist ) throws IOException { this.tmpFileDir = tmpFileDir; - this.dictionaryHashMapSize = dictionaryHashMapSize; this.domainBlacklist = domainBlacklist; this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); @@ -98,7 +95,7 @@ public class IndexServicesFactory { public KeywordLexicon getKeywordLexicon() { if (keywordLexicon == null) { final var journal = new KeywordLexiconJournal(keywordLexiconFile.get()); - keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize)); + keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create()); } return keywordLexicon; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java index 00e518e3..585c9a14 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.client; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.inject.name.Named; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; @@ -32,14 +32,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { @Inject public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException { - long hashMapSize = 1L << 31; - - if 
(Boolean.getBoolean("small-ram")) { - hashMapSize = 1L << 27; - } var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); - lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize)); + lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create()); indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile()); } @@ -72,7 +67,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient { String word = words[i]; long id = lexicon.getOrInsert(word); - if (id != DictionaryHashMap.NO_VALUE) { + if (id != DictionaryMap.NO_VALUE) { ids[putIdx++] = id; ids[putIdx++] = meta[i]; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 27514b58..5f02cb98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import io.prometheus.client.Gauge; import lombok.SneakyThrows; -import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import org.slf4j.Logger; @@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable { private int getOrInsert(byte[] bytes) { if (bytes.length >= Byte.MAX_VALUE) { logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length); - return DictionaryHashMap.NO_VALUE; + return DictionaryMap.NO_VALUE; } final long key = hashFunction.hashBytes(bytes).padToLong(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java index a51352c1..90ac84a4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexDomainQueryService.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import io.prometheus.client.Histogram; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; @@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService { private OptionalInt lookUpWord(String s) { int ret = indexes.getLexiconReader().get(s); - if (ret == DictionaryHashMap.NO_VALUE) { + if (ret == DictionaryMap.NO_VALUE) { return OptionalInt.empty(); } return OptionalInt.of(ret); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java index 7aa33038..40f7cf64 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc; import com.google.inject.Inject; import com.google.inject.Singleton; import com.google.protobuf.InvalidProtocolBufferException; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; 
import nu.marginalia.wmsa.edge.index.IndexServicesFactory; @@ -51,7 +51,7 @@ public class EdgeIndexLexiconService { final int wordId = lr.get(word); - if (DictionaryHashMap.NO_VALUE == wordId) { + if (DictionaryMap.NO_VALUE == wordId) { response.status(404); return ""; } @@ -110,7 +110,7 @@ public class EdgeIndexLexiconService { String word = words[i]; long id = keywordLexicon.getOrInsert(word); - if (id != DictionaryHashMap.NO_VALUE) { + if (id != DictionaryMap.NO_VALUE) { ids[putIdx++] = id; ids[putIdx++] = meta[i]; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index 0b8c08f4..16f100de 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -12,7 +12,7 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms; import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator; @@ -293,7 +293,7 @@ public class EdgeIndexQueryService { private OptionalInt lookUpWord(String s) { int ret = indexes.getLexiconReader().get(s); - if (ret == DictionaryHashMap.NO_VALUE) { + if (ret == DictionaryMap.NO_VALUE) { return OptionalInt.empty(); } return OptionalInt.of(ret); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java index 
a4b97a7a..3bea5500 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/forward/ForwardIndexConverterTest.java @@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.postings.forward; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; @@ -47,7 +47,7 @@ class ForwardIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java index 4c210c54..2fc9d36a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import 
nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; @@ -43,7 +43,7 @@ class ReverseIndexConverterTest { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16)); keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java index 6efcbbd3..6827e691 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/postings/reverse/ReverseIndexConverterTest2.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse; import lombok.SneakyThrows; import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.buffer.LongQueryBuffer; -import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.util.dict.OffHeapDictionaryHashMap; import nu.marginalia.util.test.TestUtil; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; @@ -50,7 +50,7 @@ class ReverseIndexConverterTest2 { dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); + keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18)); 
keywordLexicon.getOrInsert("0"); indexFile = Files.createTempFile("tmp", ".idx"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java index 46d8228c..2914c4f9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTestModule.java @@ -39,11 +39,10 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule { @Override protected void configure() { + System.setProperty("small-ram", "true"); try { bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"), - slowDir, fastDir, - 1L<<24, - null + slowDir, fastDir, null )); EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);