mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Use on-heap dictionary for small data.
This commit is contained in:
parent
4a6a1308b0
commit
4c2f54593e
@ -1,8 +1,5 @@
|
||||
package nu.marginalia.util.dict;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.util.ArrayList;
|
||||
@ -10,7 +7,6 @@ import java.util.ArrayList;
|
||||
public class DictionaryData {
|
||||
|
||||
private final int DICTIONARY_BANK_SIZE;
|
||||
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
|
||||
|
||||
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);
|
||||
|
||||
|
@ -1,6 +1,17 @@
|
||||
package nu.marginalia.util.dict;
|
||||
|
||||
public interface DictionaryMap {
|
||||
int NO_VALUE = Integer.MIN_VALUE;
|
||||
|
||||
static DictionaryMap create() {
|
||||
if (Boolean.getBoolean("small-ram")) {
|
||||
return new OnHeapDictionaryMap();
|
||||
}
|
||||
else {
|
||||
return new OffHeapDictionaryHashMap(1L << 31);
|
||||
}
|
||||
}
|
||||
|
||||
int size();
|
||||
|
||||
int put(long key);
|
||||
|
@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
|
||||
* Spiritually influenced by GNU Trove's hash maps
|
||||
* LGPL 2.1
|
||||
*/
|
||||
public class DictionaryHashMap implements DictionaryMap {
|
||||
private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
|
||||
public class OffHeapDictionaryHashMap implements DictionaryMap {
|
||||
private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);
|
||||
private static final Gauge probe_count_metrics
|
||||
= Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
|
||||
.register();
|
||||
|
||||
private final int bufferCount;
|
||||
private final IntBuffer[] buffers;
|
||||
public static final int NO_VALUE = Integer.MIN_VALUE;
|
||||
|
||||
private final DictionaryData dictionaryData;
|
||||
|
||||
@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {
|
||||
|
||||
private final AtomicInteger sz = new AtomicInteger(0);
|
||||
|
||||
public DictionaryHashMap(long sizeMemory) {
|
||||
public OffHeapDictionaryHashMap(long sizeMemory) {
|
||||
final int intSize = 4;
|
||||
|
||||
bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));
|
@ -0,0 +1,23 @@
|
||||
package nu.marginalia.util.dict;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
|
||||
public class OnHeapDictionaryMap implements DictionaryMap {
|
||||
private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
return entries.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int put(long key) {
|
||||
entries.putIfAbsent(key, entries.size());
|
||||
return get(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int get(long key) {
|
||||
return entries.getOrDefault(key, NO_VALUE);
|
||||
}
|
||||
}
|
@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
|
||||
|
||||
|
||||
public void configure() {
|
||||
if (Boolean.getBoolean("small-ram")) {
|
||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
|
||||
}
|
||||
else {
|
||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Provides
|
||||
|
@ -5,7 +5,7 @@ import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
||||
@ -48,7 +48,6 @@ public class IndexServicesFactory {
|
||||
private final PartitionedDataFile revPrioIndexWords;
|
||||
|
||||
private volatile static KeywordLexicon keywordLexicon;
|
||||
private final Long dictionaryHashMapSize;
|
||||
|
||||
private final Path searchSetsBase;
|
||||
|
||||
@ -60,12 +59,10 @@ public class IndexServicesFactory {
|
||||
@Named("tmp-file-dir") Path tmpFileDir,
|
||||
@Named("partition-root-slow") Path partitionRootSlow,
|
||||
@Named("partition-root-fast") Path partitionRootFast,
|
||||
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
|
||||
EdgeDomainBlacklist domainBlacklist
|
||||
) throws IOException {
|
||||
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.dictionaryHashMapSize = dictionaryHashMapSize;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
|
||||
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
|
||||
@ -98,7 +95,7 @@ public class IndexServicesFactory {
|
||||
public KeywordLexicon getKeywordLexicon() {
|
||||
if (keywordLexicon == null) {
|
||||
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
|
||||
keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
|
||||
keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
|
||||
}
|
||||
return keywordLexicon;
|
||||
}
|
||||
|
@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
|
||||
@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
|
||||
|
||||
@Inject
|
||||
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
|
||||
long hashMapSize = 1L << 31;
|
||||
|
||||
if (Boolean.getBoolean("small-ram")) {
|
||||
hashMapSize = 1L << 27;
|
||||
}
|
||||
|
||||
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
|
||||
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
|
||||
lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
|
||||
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
|
||||
}
|
||||
|
||||
@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
|
||||
String word = words[i];
|
||||
|
||||
long id = lexicon.getOrInsert(word);
|
||||
if (id != DictionaryHashMap.NO_VALUE) {
|
||||
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
ids[putIdx++] = id;
|
||||
ids[putIdx++] = meta[i];
|
||||
}
|
||||
|
@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import io.prometheus.client.Gauge;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import org.slf4j.Logger;
|
||||
@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
private int getOrInsert(byte[] bytes) {
|
||||
if (bytes.length >= Byte.MAX_VALUE) {
|
||||
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
|
||||
return DictionaryHashMap.NO_VALUE;
|
||||
return DictionaryMap.NO_VALUE;
|
||||
}
|
||||
|
||||
final long key = hashFunction.hashBytes(bytes).padToLong();
|
||||
|
@ -5,7 +5,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
|
||||
import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
|
||||
@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getLexiconReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
|
||||
|
||||
final int wordId = lr.get(word);
|
||||
|
||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
||||
if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
|
||||
response.status(404);
|
||||
return "";
|
||||
}
|
||||
@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
|
||||
String word = words[i];
|
||||
|
||||
long id = keywordLexicon.getOrInsert(word);
|
||||
if (id != DictionaryHashMap.NO_VALUE) {
|
||||
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
ids[putIdx++] = id;
|
||||
ids[putIdx++] = meta[i];
|
||||
}
|
||||
|
@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
|
||||
@ -293,7 +293,7 @@ public class EdgeIndexQueryService {
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getLexiconReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.util.test.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
@ -47,7 +47,7 @@ class ForwardIndexConverterTest {
|
||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
||||
dictionaryFile.toFile().deleteOnExit();
|
||||
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18));
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
|
||||
keywordLexicon.getOrInsert("0");
|
||||
|
||||
indexFile = Files.createTempFile("tmp", ".idx");
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.util.test.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
@ -43,7 +43,7 @@ class ReverseIndexConverterTest {
|
||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
||||
dictionaryFile.toFile().deleteOnExit();
|
||||
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<16));
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<16));
|
||||
keywordLexicon.getOrInsert("0");
|
||||
|
||||
indexFile = Files.createTempFile("tmp", ".idx");
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.postings.reverse;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.util.test.TestUtil;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
@ -50,7 +50,7 @@ class ReverseIndexConverterTest2 {
|
||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
||||
dictionaryFile.toFile().deleteOnExit();
|
||||
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new DictionaryHashMap(1L<<18));
|
||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()), new OffHeapDictionaryHashMap(1L<<18));
|
||||
keywordLexicon.getOrInsert("0");
|
||||
|
||||
indexFile = Files.createTempFile("tmp", ".idx");
|
||||
|
@ -39,11 +39,10 @@ public class EdgeIndexIntegrationTestModule extends AbstractModule {
|
||||
@Override
|
||||
protected void configure() {
|
||||
|
||||
System.setProperty("small-ram", "true");
|
||||
try {
|
||||
bind(IndexServicesFactory.class).toInstance(new IndexServicesFactory(Path.of("/tmp"),
|
||||
slowDir, fastDir,
|
||||
1L<<24,
|
||||
null
|
||||
slowDir, fastDir, null
|
||||
));
|
||||
|
||||
EdgeIndexSearchSetsService setsServiceMock = Mockito.mock(EdgeIndexSearchSetsService.class);
|
||||
|
Loading…
Reference in New Issue
Block a user