Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(index) Don't load fwd index offsets into a hash table at start.
Loading the offsets into a hash table makes the service take forever to start up. Memory-map the data instead and binary search it. This is a bit slower, but not by much.
commit f01267bc6b
parent df6a05b9a7
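Rough arithmetic on the trade-off (illustrative numbers, not from the commit): a hash probe is O(1), while binary search over n sorted ids costs about log2(n) probes. For n = 10^8 documents that is roughly 27 array touches per lookup instead of one or two, and the top levels of the search tree stay resident in the page cache after warm-up, hence "slower, but not by much". In exchange, startup no longer has to read the entire ids file and hash every entry into a heap table that the removed code's own comment pegs at "typically less than 1 Gb".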
nu/marginalia/index/forward/ForwardIndexReader.java

@@ -1,6 +1,5 @@
 package nu.marginalia.index.forward;
 
-import gnu.trove.map.hash.TLongIntHashMap;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.forward.spans.DocumentSpans;
@@ -29,7 +28,7 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
  * The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
  */
 public class ForwardIndexReader {
-    private final TLongIntHashMap idToOffset;
+    private final LongArray ids;
     private final LongArray data;
 
     private final ForwardIndexSpansReader spansReader;
@@ -41,21 +40,21 @@ public class ForwardIndexReader {
                               Path spansFile) throws IOException {
         if (!Files.exists(dataFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
         }
         else if (!Files.exists(idsFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
         }
         else if (!Files.exists(spansFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
@@ -63,21 +62,13 @@ public class ForwardIndexReader {
         }
 
         logger.info("Switching forward index");
 
-        idToOffset = loadIds(idsFile);
+        ids = loadIds(idsFile);
         data = loadData(dataFile);
         spansReader = new ForwardIndexSpansReader(spansFile);
     }
 
-    private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
-        try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
-            assert idsArray.size() < Integer.MAX_VALUE;
-
-            var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
-            // This hash table should be of the same size as the number of documents, so typically less than 1 Gb
-            idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos));
-
-            return ids;
-        }
+    private static LongArray loadIds(Path idsFile) throws IOException {
+        return LongArrayFactory.mmapForReadingShared(idsFile);
     }
 
     private static LongArray loadData(Path dataFile) throws IOException {
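To see where the startup time went, here is the change reduced to a side-by-side sketch, reusing identifiers from the hunk above (illustrative, not a compilable excerpt of the project):

    // Old: read every entry of the ids file at startup and hash it into a
    // Trove map. O(n) I/O and hashing before the service can answer
    // anything, plus a heap table of "typically less than 1 Gb".
    var idToOffset = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
    idsArray.forEach(0, idsArray.size(),
            (pos, val) -> idToOffset.put(val, (int) pos));

    // New: just map the file. No upfront I/O; the OS pages the ids in on
    // first access, so startup cost is flat regardless of index size.
    LongArray ids = LongArrayFactory.mmapForReadingShared(idsFile);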
@@ -115,14 +106,16 @@ public class ForwardIndexReader {
     private int idxForDoc(long docId) {
         assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
 
-        if (getClass().desiredAssertionStatus()) {
-            long offset = idToOffset.get(docId);
-            if (offset < 0) { // Ideally we'd always check this, but this is a very hot method
+        long offset = ids.binarySearch(docId, 0, ids.size());
+
+        if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
+            if (getClass().desiredAssertionStatus()) {
                 logger.warn("Could not find offset for doc {}", docId);
             }
+            return -1;
         }
 
-        return idToOffset.get(docId);
+        return (int) offset;
     }
 
     public DocumentSpans getDocumentSpans(Arena arena, long docId) {
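For readers unfamiliar with the pattern, here is the lookup idea in plain-JDK terms: mmap a file of sorted longs and binary search it. This is a hypothetical stand-in for the project's LongArray and its binarySearch, whose exact contract the diff doesn't show; native byte order and the 2 GB MappedByteBuffer limit are assumptions of this sketch (the project's own LongArray exists in part to avoid that limit).

    import java.io.IOException;
    import java.nio.ByteOrder;
    import java.nio.LongBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;

    class SortedIdFile {
        private final LongBuffer ids;

        SortedIdFile(Path idsFile) throws IOException {
            try (var ch = FileChannel.open(idsFile, StandardOpenOption.READ)) {
                // Map the whole file and view it as an array of longs.
                // The mapping stays valid after the channel is closed.
                ids = ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size())
                        .order(ByteOrder.nativeOrder())
                        .asLongBuffer();
            }
        }

        // Classic binary search; returns the position of docId, or the
        // same -1 "not found" sentinel the patched idxForDoc returns.
        int idxForDoc(long docId) {
            int lo = 0, hi = ids.limit() - 1;
            while (lo <= hi) {
                int mid = (lo + hi) >>> 1;
                long val = ids.get(mid);
                if (val < docId) lo = mid + 1;
                else if (val > docId) hi = mid - 1;
                else return mid;
            }
            return -1;
        }
    }

Note that this only works if the ids file is sorted in ascending order, a property the binary search in the patched idxForDoc likewise relies on; the old hash-map load had no such requirement.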
@@ -142,7 +135,7 @@ public class ForwardIndexReader {
 
 
     public int totalDocCount() {
-        return idToOffset.size();
+        return (int) ids.size();
     }
 
     public void close() {