(index) Don't load forward index offsets into a hash table at startup.

Building the hash table makes the service take forever to start up.  Memory-map the ids file instead and binary search it on each lookup.  Lookups are a bit slower this way, but not by much.
This commit is contained in:
Viktor Lofgren 2024-08-06 11:16:28 +02:00
parent df6a05b9a7
commit f01267bc6b

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans; import nu.marginalia.index.forward.spans.DocumentSpans;
@ -29,7 +28,7 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata} * The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
*/ */
public class ForwardIndexReader { public class ForwardIndexReader {
private final TLongIntHashMap idToOffset; private final LongArray ids;
private final LongArray data; private final LongArray data;
private final ForwardIndexSpansReader spansReader; private final ForwardIndexSpansReader spansReader;
@ -41,21 +40,21 @@ public class ForwardIndexReader {
Path spansFile) throws IOException { Path spansFile) throws IOException {
if (!Files.exists(dataFile)) { if (!Files.exists(dataFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile); logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
idToOffset = null; ids = null;
data = null; data = null;
spansReader = null; spansReader = null;
return; return;
} }
else if (!Files.exists(idsFile)) { else if (!Files.exists(idsFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile); logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
idToOffset = null; ids = null;
data = null; data = null;
spansReader = null; spansReader = null;
return; return;
} }
else if (!Files.exists(spansFile)) { else if (!Files.exists(spansFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile); logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
idToOffset = null; ids = null;
data = null; data = null;
spansReader = null; spansReader = null;
return; return;
@ -63,21 +62,13 @@ public class ForwardIndexReader {
logger.info("Switching forward index"); logger.info("Switching forward index");
idToOffset = loadIds(idsFile); ids = loadIds(idsFile);
data = loadData(dataFile); data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile); spansReader = new ForwardIndexSpansReader(spansFile);
} }
private static TLongIntHashMap loadIds(Path idsFile) throws IOException { private static LongArray loadIds(Path idsFile) throws IOException {
try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) { return LongArrayFactory.mmapForReadingShared(idsFile);
assert idsArray.size() < Integer.MAX_VALUE;
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
// This hash table should be of the same size as the number of documents, so typically less than 1 Gb
idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos));
return ids;
}
} }
private static LongArray loadData(Path dataFile) throws IOException { private static LongArray loadData(Path dataFile) throws IOException {
@ -115,14 +106,16 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) { private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = ids.binarySearch(docId, 0, ids.size());
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) { if (getClass().desiredAssertionStatus()) {
long offset = idToOffset.get(docId);
if (offset < 0) { // Ideally we'd always check this, but this is a very hot method
logger.warn("Could not find offset for doc {}", docId); logger.warn("Could not find offset for doc {}", docId);
} }
return -1;
} }
return idToOffset.get(docId); return (int) offset;
} }
public DocumentSpans getDocumentSpans(Arena arena, long docId) { public DocumentSpans getDocumentSpans(Arena arena, long docId) {
@ -142,7 +135,7 @@ public class ForwardIndexReader {
public int totalDocCount() { public int totalDocCount() {
return idToOffset.size(); return (int) ids.size();
} }
public void close() { public void close() {