Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(index) Don't load fwd index offsets into a hash table at start.
Loading the offsets into a hash table makes the service take forever to start up. Memory-map the data instead and binary search it. This is a bit slower, but not by much.
commit f01267bc6b
parent df6a05b9a7
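Rough arithmetic on the trade-off (illustrative numbers, not from the commit): a hash probe is O(1), while binary search over n sorted ids costs about log2(n) probes. For n = 10^8 documents that is roughly 27 array touches per lookup instead of one or two, and the top levels of the search tree stay resident in the page cache after warm-up, hence "slower, but not by much". In exchange, startup no longer has to read the entire ids file and hash every entry into a heap table that the removed code's own comment pegs at "typically less than 1 Gb".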
nu/marginalia/index/forward/ForwardIndexReader.java

@@ -1,6 +1,5 @@
 package nu.marginalia.index.forward;
 
-import gnu.trove.map.hash.TLongIntHashMap;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.forward.spans.DocumentSpans;
@@ -29,7 +28,7 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
  * The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
  */
 public class ForwardIndexReader {
-    private final TLongIntHashMap idToOffset;
+    private final LongArray ids;
     private final LongArray data;
 
     private final ForwardIndexSpansReader spansReader;
@@ -41,21 +40,21 @@ public class ForwardIndexReader {
                               Path spansFile) throws IOException {
         if (!Files.exists(dataFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
         }
         else if (!Files.exists(idsFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
         }
         else if (!Files.exists(spansFile)) {
             logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
-            idToOffset = null;
+            ids = null;
             data = null;
             spansReader = null;
             return;
@@ -63,21 +62,13 @@ public class ForwardIndexReader {
         }
 
         logger.info("Switching forward index");
 
-        idToOffset = loadIds(idsFile);
+        ids = loadIds(idsFile);
         data = loadData(dataFile);
         spansReader = new ForwardIndexSpansReader(spansFile);
     }
 
-    private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
-        try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
-            assert idsArray.size() < Integer.MAX_VALUE;
-
-            var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
-            // This hash table should be of the same size as the number of documents, so typically less than 1 Gb
-            idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos));
-
-            return ids;
-        }
+    private static LongArray loadIds(Path idsFile) throws IOException {
+        return LongArrayFactory.mmapForReadingShared(idsFile);
     }
 
     private static LongArray loadData(Path dataFile) throws IOException {
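To see where the startup time went, here is the change reduced to a side-by-side sketch, reusing identifiers from the hunk above (illustrative, not a compilable excerpt of the project):

    // Old: read every entry of the ids file at startup and hash it into a
    // Trove map. O(n) I/O and hashing before the service can answer
    // anything, plus a heap table of "typically less than 1 Gb".
    var idToOffset = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
    idsArray.forEach(0, idsArray.size(),
            (pos, val) -> idToOffset.put(val, (int) pos));

    // New: just map the file. No upfront I/O; the OS pages the ids in on
    // first access, so startup cost is flat regardless of index size.
    LongArray ids = LongArrayFactory.mmapForReadingShared(idsFile);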
@@ -115,14 +106,16 @@ public class ForwardIndexReader {
     private int idxForDoc(long docId) {
         assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
 
-        if (getClass().desiredAssertionStatus()) {
-            long offset = idToOffset.get(docId);
-            if (offset < 0) { // Ideally we'd always check this, but this is a very hot method
+        long offset = ids.binarySearch(docId, 0, ids.size());
+
+        if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
+            if (getClass().desiredAssertionStatus()) {
                 logger.warn("Could not find offset for doc {}", docId);
             }
+            return -1;
         }
 
-        return idToOffset.get(docId);
+        return (int) offset;
     }
 
     public DocumentSpans getDocumentSpans(Arena arena, long docId) {
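For readers unfamiliar with the pattern, here is the lookup idea in plain-JDK terms: mmap a file of sorted longs and binary search it. This is a hypothetical stand-in for the project's LongArray and its binarySearch, whose exact contract the diff doesn't show; native byte order and the 2 GB MappedByteBuffer limit are assumptions of this sketch (the project's own LongArray exists in part to avoid that limit).

    import java.io.IOException;
    import java.nio.ByteOrder;
    import java.nio.LongBuffer;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;

    class SortedIdFile {
        private final LongBuffer ids;

        SortedIdFile(Path idsFile) throws IOException {
            try (var ch = FileChannel.open(idsFile, StandardOpenOption.READ)) {
                // Map the whole file and view it as an array of longs.
                // The mapping stays valid after the channel is closed.
                ids = ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size())
                        .order(ByteOrder.nativeOrder())
                        .asLongBuffer();
            }
        }

        // Classic binary search; returns the position of docId, or the
        // same -1 "not found" sentinel the patched idxForDoc returns.
        int idxForDoc(long docId) {
            int lo = 0, hi = ids.limit() - 1;
            while (lo <= hi) {
                int mid = (lo + hi) >>> 1;
                long val = ids.get(mid);
                if (val < docId) lo = mid + 1;
                else if (val > docId) hi = mid - 1;
                else return mid;
            }
            return -1;
        }
    }

Note that this only works if the ids file is sorted in ascending order, a property the binary search in the patched idxForDoc likewise relies on; the old hash-map load had no such requirement.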
@@ -142,7 +135,7 @@ public class ForwardIndexReader {
 
 
     public int totalDocCount() {
-        return idToOffset.size();
+        return (int) ids.size();
     }
 
     public void close() {