Use on-heap dictionary for small data.

This commit is contained in:
Viktor Lofgren 2023-01-30 13:10:56 +01:00
parent 4a6a1308b0
commit 4c2f54593e
15 changed files with 59 additions and 46 deletions

View File

@ -1,8 +1,5 @@
package nu.marginalia.util.dict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
@ -10,7 +7,6 @@ import java.util.ArrayList;
public class DictionaryData {
private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);

View File

@ -1,6 +1,17 @@
package nu.marginalia.util.dict;
public interface DictionaryMap {
int NO_VALUE = Integer.MIN_VALUE;
/**
 * Creates a {@link DictionaryMap}, choosing the implementation from the
 * {@code small-ram} JVM system property.
 *
 * @return a new {@link OnHeapDictionaryMap} when {@code small-ram} is set to
 *         {@code true}; otherwise a new {@link OffHeapDictionaryHashMap}
 *         constructed with a {@code sizeMemory} argument of {@code 1L << 31}
 */
static DictionaryMap create() {
    final boolean smallRam = Boolean.getBoolean("small-ram");

    return smallRam
            ? new OnHeapDictionaryMap()
            : new OffHeapDictionaryHashMap(1L << 31);
}
int size();
int put(long key);

View File

@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
* Spiritually influenced by GNU Trove's hash maps
* LGPL 2.1
*/
public class DictionaryHashMap implements DictionaryMap {
private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
public class OffHeapDictionaryHashMap implements DictionaryMap {
private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);
private static final Gauge probe_count_metrics
= Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
.register();
private final int bufferCount;
private final IntBuffer[] buffers;
public static final int NO_VALUE = Integer.MIN_VALUE;
private final DictionaryData dictionaryData;
@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {
private final AtomicInteger sz = new AtomicInteger(0);
public DictionaryHashMap(long sizeMemory) {
public OffHeapDictionaryHashMap(long sizeMemory) {
final int intSize = 4;
bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));

View File

@ -0,0 +1,23 @@
package nu.marginalia.util.dict;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
/**
 * {@link DictionaryMap} implementation that keeps its entries in a fastutil
 * {@link Long2IntOpenHashMap}.
 *
 * <p>Each key seen for the first time is assigned the number of entries the map
 * held just before the insertion, so values are handed out as 0, 1, 2, ... in
 * insertion order.
 *
 * <p>NOTE(review): no synchronization is visible in this class; confirm whether
 * callers access it from more than one thread.
 */
public class OnHeapDictionaryMap implements DictionaryMap {

    /** Backing key-to-id table, created with an expected size of 100,000 and load factor 0.75. */
    private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);

    /**
     * @return the number of keys currently stored
     */
    @Override
    public int size() {
        return entries.size();
    }

    /**
     * Registers {@code key} if it is not already present and reports its value.
     *
     * @param key the key to insert
     * @return the value associated with {@code key} after the call, as reported by {@link #get(long)}
     */
    @Override
    public int put(long key) {
        // The id for a new key is the entry count prior to insertion.
        final int candidateId = entries.size();
        entries.putIfAbsent(key, candidateId);

        return get(key);
    }

    /**
     * Looks up the value stored for {@code key}.
     *
     * @param key the key to look up
     * @return the stored value, or {@code NO_VALUE} when the key is absent
     */
    @Override
    public int get(long key) {
        return entries.getOrDefault(key, NO_VALUE);
    }
}

View File

@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
public void configure() {
if (Boolean.getBoolean("small-ram")) {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
}
else {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
}
}
@Provides

View File

@ -5,7 +5,7 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
@ -48,7 +48,6 @@ public class IndexServicesFactory {
private final PartitionedDataFile revPrioIndexWords;
private volatile static KeywordLexicon keywordLexicon;
private final Long dictionaryHashMapSize;
private final Path searchSetsBase;
@ -60,12 +59,10 @@ public class IndexServicesFactory {
@Named("tmp-file-dir") Path tmpFileDir,
@Named("partition-root-slow") Path partitionRootSlow,
@Named("partition-root-fast") Path partitionRootFast,
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
EdgeDomainBlacklist domainBlacklist
) throws IOException {
this.tmpFileDir = tmpFileDir;
this.dictionaryHashMapSize = dictionaryHashMapSize;
this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
@ -98,7 +95,7 @@ public class IndexServicesFactory {
public KeywordLexicon getKeywordLexicon() {
if (keywordLexicon == null) {
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
}
return keywordLexicon;
}

View File

@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@Inject
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
long hashMapSize = 1L << 31;
if (Boolean.getBoolean("small-ram")) {
hashMapSize = 1L << 27;
}
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
}
@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
String word = words[i];
long id = lexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}

View File

@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.prometheus.client.Gauge;
import lombok.SneakyThrows;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import org.slf4j.Logger;
@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
private int getOrInsert(byte[] bytes) {
if (bytes.length >= Byte.MAX_VALUE) {
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
return DictionaryHashMap.NO_VALUE;
return DictionaryMap.NO_VALUE;
}
final long key = hashFunction.hashBytes(bytes).padToLong();

View File

@ -5,7 +5,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Histogram;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
final int wordId = lr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) {
if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
response.status(404);
return "";
}
@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
String word = words[i];
long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}

View File

@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
@ -293,7 +293,7 @@ public class EdgeIndexQueryService {
private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -47,7 +47,7 @@ class ForwardIndexConverterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18));
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -43,7 +43,7 @@ class ReverseIndexConverterTest {
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16));
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16));
keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.test.TestUtil;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
@ -50,7 +50,7 @@ class ReverseIndexConverterTest2 {
dictionaryFile = Files.createTempFile("tmp", ".dict");
dictionaryFile.toFile().deleteOnExit();
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18));
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
keywordLexicon.getOrInsert("0");
indexFile = Files.createTempFile("tmp", ".idx");

View File

@ -39,11 +39,10 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
@Override
protected void configure() {
System.setProperty("small-ram", "true");
try {
bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
slowDir, fastDir,
1L<<24,
null
slowDir, fastDir, null
));
EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);