Use on-heap dictionary for small data.

This commit is contained in:
Viktor Lofgren 2023-01-30 13:10:56 +01:00
parent 4a6a1308b0
commit 4c2f54593e
15 changed files with 59 additions and 46 deletions

View File

@ -1,8 +1,5 @@
package nu.marginalia.util.dict; package nu.marginalia.util.dict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.LongBuffer; import java.nio.LongBuffer;
import java.util.ArrayList; import java.util.ArrayList;
@ -10,7 +7,6 @@ import java.util.ArrayList;
public class DictionaryData { public class DictionaryData {
private final int DICTIONARY_BANK_SIZE; private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100); private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);

View File

@ -1,6 +1,17 @@
package nu.marginalia.util.dict; package nu.marginalia.util.dict;
public interface DictionaryMap { public interface DictionaryMap {
int NO_VALUE = Integer.MIN_VALUE;
static DictionaryMap create() {
if (Boolean.getBoolean("small-ram")) {
return new OnHeapDictionaryMap();
}
else {
return new OffHeapDictionaryHashMap(1L << 31);
}
}
int size(); int size();
int put(long key); int put(long key);

View File

@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
* Spiritually influenced by GNU Trove's hash maps * Spiritually influenced by GNU Trove's hash maps
* LGPL 2.1 * LGPL 2.1
*/ */
public class DictionaryHashMap implements DictionaryMap { public class OffHeapDictionaryHashMap implements DictionaryMap {
private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class); private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);
private static final Gauge probe_count_metrics private static final Gauge probe_count_metrics
= Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count") = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
.register(); .register();
private final int bufferCount; private final int bufferCount;
private final IntBuffer[] buffers; private final IntBuffer[] buffers;
public static final int NO_VALUE = Integer.MIN_VALUE;
private final DictionaryData dictionaryData; private final DictionaryData dictionaryData;
@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {
private final AtomicInteger sz = new AtomicInteger(0); private final AtomicInteger sz = new AtomicInteger(0);
public DictionaryHashMap(long sizeMemory) { public OffHeapDictionaryHashMap(long sizeMemory) {
final int intSize = 4; final int intSize = 4;
bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30)); bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));

View File

@ -0,0 +1,23 @@
package nu.marginalia.util.dict;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
/**
 * Lexicon dictionary kept entirely on the Java heap, backed by a fastutil
 * Long2IntOpenHashMap. Intended for the "small-ram" configuration where the
 * off-heap variant's large direct-memory allocation would be wasteful.
 *
 * <p>Keys are 64-bit hashes; each distinct key is assigned a dense int id
 * in insertion order, starting at 0 (the id is simply the map size at the
 * time of insertion).
 */
public class OnHeapDictionaryMap implements DictionaryMap {
    // key -> dense id; pre-sized to reduce early rehashing
    private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);

    @Override
    public int size() {
        return entries.size();
    }

    /**
     * Inserts the key if it is not already present, assigning it the next
     * dense id, then returns the key's id. Idempotent for repeated keys.
     */
    @Override
    public int put(long key) {
        if (!entries.containsKey(key)) {
            // New key: its id is the current number of entries.
            entries.put(key, entries.size());
        }
        return entries.get(key);
    }

    /**
     * Returns the id previously assigned to the key, or NO_VALUE
     * (Integer.MIN_VALUE, from DictionaryMap) if the key was never inserted.
     */
    @Override
    public int get(long key) {
        if (entries.containsKey(key)) {
            return entries.get(key);
        }
        return NO_VALUE;
    }
}

View File

@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
public void configure() { public void configure() {
if (Boolean.getBoolean("small-ram")) {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
}
else {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
}
} }
@Provides @Provides

View File

@ -5,7 +5,7 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist; import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
@ -48,7 +48,6 @@ public class IndexServicesFactory {
private final PartitionedDataFile revPrioIndexWords; private final PartitionedDataFile revPrioIndexWords;
private volatile static KeywordLexicon keywordLexicon; private volatile static KeywordLexicon keywordLexicon;
private final Long dictionaryHashMapSize;
private final Path searchSetsBase; private final Path searchSetsBase;
@ -60,12 +59,10 @@ public class IndexServicesFactory {
@Named("tmp-file-dir") Path tmpFileDir, @Named("tmp-file-dir") Path tmpFileDir,
@Named("partition-root-slow") Path partitionRootSlow, @Named("partition-root-slow") Path partitionRootSlow,
@Named("partition-root-fast") Path partitionRootFast, @Named("partition-root-fast") Path partitionRootFast,
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
EdgeDomainBlacklist domainBlacklist EdgeDomainBlacklist domainBlacklist
) throws IOException { ) throws IOException {
this.tmpFileDir = tmpFileDir; this.tmpFileDir = tmpFileDir;
this.dictionaryHashMapSize = dictionaryHashMapSize;
this.domainBlacklist = domainBlacklist; this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat"); this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
@ -98,7 +95,7 @@ public class IndexServicesFactory {
public KeywordLexicon getKeywordLexicon() { public KeywordLexicon getKeywordLexicon() {
if (keywordLexicon == null) { if (keywordLexicon == null) {
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get()); final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize)); keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
} }
return keywordLexicon; return keywordLexicon;
} }

View File

@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@Inject @Inject
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException { public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
long hashMapSize = 1L << 31;
if (Boolean.getBoolean("small-ram")) {
hashMapSize = 1L << 27;
}
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize)); lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile()); indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
} }
@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
String word = words[i]; String word = words[i];
long id = lexicon.getOrInsert(word); long id = lexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) { if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id; ids[putIdx++] = id;
ids[putIdx++] = meta[i]; ids[putIdx++] = meta[i];
} }

View File

@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import io.prometheus.client.Gauge; import io.prometheus.client.Gauge;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap; import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
private int getOrInsert(byte[] bytes) { private int getOrInsert(byte[] bytes) {
if (bytes.length >= Byte.MAX_VALUE) { if (bytes.length >= Byte.MAX_VALUE) {
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length); logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
return DictionaryHashMap.NO_VALUE; return DictionaryMap.NO_VALUE;
} }
final long key = hashFunction.hashBytes(bytes).padToLong(); final long key = hashFunction.hashBytes(bytes).padToLong();

View File

@ -5,7 +5,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator; import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
private OptionalInt lookUpWord(String s) { private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s); int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) { if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty(); return OptionalInt.empty();
} }
return OptionalInt.of(ret); return OptionalInt.of(ret);

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory; import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
final int wordId = lr.get(word); final int wordId = lr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) { if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
response.status(404); response.status(404);
return ""; return "";
} }
@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
String word = words[i]; String word = words[i];
long id = keywordLexicon.getOrInsert(word); long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) { if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id; ids[putIdx++] = id;
ids[putIdx++] = meta[i]; ids[putIdx++] = meta[i];
} }

View File

@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms; import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator; import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
@ -293,7 +293,7 @@ public class EdgeIndexQueryService {
private OptionalInt lookUpWord(String s) { private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s); int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) { if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty(); return OptionalInt.empty();
} }
return OptionalInt.of(ret); return OptionalInt.of(ret);

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil; import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -47,7 +47,7 @@ class ForwardIndexConverterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit(); dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
keywordLexicon.getOrInsert("0"); keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx"); indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil; import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -43,7 +43,7 @@ class ReverseIndexConverterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit(); dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16)); keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16));
keywordLexicon.getOrInsert("0"); keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx"); indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray; import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.buffer.LongQueryBuffer; import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil; import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -50,7 +50,7 @@ class ReverseIndexConverterTest2 {
dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit(); dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18)); keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
keywordLexicon.getOrInsert("0"); keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx"); indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -39,11 +39,10 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
@Override @Override
protected void configure() { protected void configure() {
System.setProperty("small-ram", "true");
try { try {
bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"), bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
slowDir, fastDir, slowDir, fastDir, null
1L<<24,
null
)); ));
EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class); EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);