(index) Don't load fwd index offsets into a hash table at start.

Loading the offsets into a hash table makes the service take forever to start up.  Memory-map the ids file instead and find each offset with a binary search.  Lookups are a bit slower this way, but not by much.
This commit is contained in:
Viktor Lofgren 2024-08-06 11:16:28 +02:00
parent df6a05b9a7
commit f01267bc6b

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.forward;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans;
@ -29,7 +28,7 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
* The metadata is a binary encoding of {@link nu.marginalia.idx.DocumentMetadata}
*/
public class ForwardIndexReader {
private final TLongIntHashMap idToOffset;
private final LongArray ids;
private final LongArray data;
private final ForwardIndexSpansReader spansReader;
@ -41,21 +40,21 @@ public class ForwardIndexReader {
Path spansFile) throws IOException {
if (!Files.exists(dataFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
idToOffset = null;
ids = null;
data = null;
spansReader = null;
return;
}
else if (!Files.exists(idsFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
idToOffset = null;
ids = null;
data = null;
spansReader = null;
return;
}
else if (!Files.exists(spansFile)) {
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
idToOffset = null;
ids = null;
data = null;
spansReader = null;
return;
@ -63,21 +62,13 @@ public class ForwardIndexReader {
logger.info("Switching forward index");
idToOffset = loadIds(idsFile);
ids = loadIds(idsFile);
data = loadData(dataFile);
spansReader = new ForwardIndexSpansReader(spansFile);
}
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
assert idsArray.size() < Integer.MAX_VALUE;
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
// This hash table should be of the same size as the number of documents, so typically less than 1 Gb
idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos));
return ids;
}
private static LongArray loadIds(Path idsFile) throws IOException {
return LongArrayFactory.mmapForReadingShared(idsFile);
}
private static LongArray loadData(Path dataFile) throws IOException {
@ -115,14 +106,16 @@ public class ForwardIndexReader {
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = ids.binarySearch(docId, 0, ids.size());
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
if (getClass().desiredAssertionStatus()) {
long offset = idToOffset.get(docId);
if (offset < 0) { // Ideally we'd always check this, but this is a very hot method
logger.warn("Could not find offset for doc {}", docId);
}
return -1;
}
return idToOffset.get(docId);
return (int) offset;
}
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
@ -142,7 +135,7 @@ public class ForwardIndexReader {
public int totalDocCount() {
return idToOffset.size();
return (int) ids.size();
}
public void close() {