mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Don't load fwd index offsets into a hash table at start.
Building that hash table makes the service take forever to start up. Memory-map the data instead and find offsets with a binary search. Lookups are a bit slower this way, but not by much.
This commit is contained in:
parent
df6a05b9a7
commit
f01267bc6b
@ -1,6 +1,5 @@
|
|||||||
package nu.marginalia.index.forward;
|
package nu.marginalia.index.forward;
|
||||||
|
|
||||||
import gnu.trove.map.hash.TLongIntHashMap;
|
|
||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
@ -29,7 +28,7 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*;
|
|||||||
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
|
* The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata}
|
||||||
*/
|
*/
|
||||||
public class ForwardIndexReader {
|
public class ForwardIndexReader {
|
||||||
private final TLongIntHashMap idToOffset;
|
private final LongArray ids;
|
||||||
private final LongArray data;
|
private final LongArray data;
|
||||||
|
|
||||||
private final ForwardIndexSpansReader spansReader;
|
private final ForwardIndexSpansReader spansReader;
|
||||||
@ -41,21 +40,21 @@ public class ForwardIndexReader {
|
|||||||
Path spansFile) throws IOException {
|
Path spansFile) throws IOException {
|
||||||
if (!Files.exists(dataFile)) {
|
if (!Files.exists(dataFile)) {
|
||||||
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
|
logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile);
|
||||||
idToOffset = null;
|
ids = null;
|
||||||
data = null;
|
data = null;
|
||||||
spansReader = null;
|
spansReader = null;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if (!Files.exists(idsFile)) {
|
else if (!Files.exists(idsFile)) {
|
||||||
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
|
logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile);
|
||||||
idToOffset = null;
|
ids = null;
|
||||||
data = null;
|
data = null;
|
||||||
spansReader = null;
|
spansReader = null;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if (!Files.exists(spansFile)) {
|
else if (!Files.exists(spansFile)) {
|
||||||
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
|
logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile);
|
||||||
idToOffset = null;
|
ids = null;
|
||||||
data = null;
|
data = null;
|
||||||
spansReader = null;
|
spansReader = null;
|
||||||
return;
|
return;
|
||||||
@ -63,21 +62,13 @@ public class ForwardIndexReader {
|
|||||||
|
|
||||||
logger.info("Switching forward index");
|
logger.info("Switching forward index");
|
||||||
|
|
||||||
idToOffset = loadIds(idsFile);
|
ids = loadIds(idsFile);
|
||||||
data = loadData(dataFile);
|
data = loadData(dataFile);
|
||||||
spansReader = new ForwardIndexSpansReader(spansFile);
|
spansReader = new ForwardIndexSpansReader(spansFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
|
private static LongArray loadIds(Path idsFile) throws IOException {
|
||||||
try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) {
|
return LongArrayFactory.mmapForReadingShared(idsFile);
|
||||||
assert idsArray.size() < Integer.MAX_VALUE;
|
|
||||||
|
|
||||||
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
|
|
||||||
// This hash table should be of the same size as the number of documents, so typically less than 1 Gb
|
|
||||||
idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos));
|
|
||||||
|
|
||||||
return ids;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static LongArray loadData(Path dataFile) throws IOException {
|
private static LongArray loadData(Path dataFile) throws IOException {
|
||||||
@ -115,14 +106,16 @@ public class ForwardIndexReader {
|
|||||||
private int idxForDoc(long docId) {
|
private int idxForDoc(long docId) {
|
||||||
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
|
||||||
|
|
||||||
if (getClass().desiredAssertionStatus()) {
|
long offset = ids.binarySearch(docId, 0, ids.size());
|
||||||
long offset = idToOffset.get(docId);
|
|
||||||
if (offset < 0) { // Ideally we'd always check this, but this is a very hot method
|
if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) {
|
||||||
|
if (getClass().desiredAssertionStatus()) {
|
||||||
logger.warn("Could not find offset for doc {}", docId);
|
logger.warn("Could not find offset for doc {}", docId);
|
||||||
}
|
}
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return idToOffset.get(docId);
|
return (int) offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
|
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
|
||||||
@ -142,7 +135,7 @@ public class ForwardIndexReader {
|
|||||||
|
|
||||||
|
|
||||||
public int totalDocCount() {
|
public int totalDocCount() {
|
||||||
return idToOffset.size();
|
return (int) ids.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() {
|
public void close() {
|
||||||
|
Loading…
Reference in New Issue
Block a user