mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Refactoring BTreeReader and binary search code
This commit is contained in:
parent
f76af4ca79
commit
420b9bb7e0
@ -11,94 +11,68 @@ public class BTreeReader {
|
||||
|
||||
private final MultimapFileLong file;
|
||||
private final BTreeContext ctx;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(BTreeReader.class);
|
||||
private final long mask;
|
||||
private final MultimapSearcher searcher;
|
||||
|
||||
private final MultimapSearcher indexSearcher;
|
||||
private final MultimapSearcher dataSearcher;
|
||||
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
|
||||
this.file = file;
|
||||
this.searcher = file.createSearcher();
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.mask = ctx.equalityMask();
|
||||
}
|
||||
|
||||
public long fileSize() {
|
||||
return file.size();
|
||||
public BTreeHeader getHeader(long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader(long offset) {
|
||||
return new BTreeHeader(file.get(offset), file.get(offset+1), file.get(offset+2));
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(BTreeHeader header, final long keyRaw) {
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
|
||||
public long offsetForEntry(BTreeHeader header, final long keyRaw) {
|
||||
final long key = keyRaw & mask;
|
||||
final long dataAddress = header.dataOffsetLongs();
|
||||
final int entrySize = ctx.entrySize();
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
if (header.layers() == 0) {
|
||||
return trivialSearch(header, key);
|
||||
if (header.layers() == 0) { // For small data, we only have a data block
|
||||
return dataSearcher.binarySearchUpperBound(key, dataAddress, header.numEntries());
|
||||
}
|
||||
|
||||
long p = searchEntireTopLayer(header, key);
|
||||
if (p < 0) return -1;
|
||||
final long indexOffset = header.indexOffsetLongs();
|
||||
|
||||
long cumOffset = p * ctx.BLOCK_SIZE_WORDS();
|
||||
// Search the top layer
|
||||
long layerOffset = indexSearch(key, indexOffset, blockSize);
|
||||
if (layerOffset < 0) return -1;
|
||||
|
||||
// Search intermediary layers
|
||||
for (int i = header.layers() - 2; i >= 0; --i) {
|
||||
long offsetBase = header.indexOffsetLongs() + header.relativeLayerOffset(ctx, i);
|
||||
p = searchLayerBlock(key, offsetBase+cumOffset);
|
||||
if (p < 0)
|
||||
final long layerAddressBase = indexOffset + header.relativeIndexLayerOffset(ctx, i);
|
||||
final long layerBlockOffset = layerAddressBase + layerOffset;
|
||||
|
||||
final long nextLayerOffset = indexSearch(key, layerBlockOffset, blockSize);
|
||||
if (nextLayerOffset < 0)
|
||||
return -1;
|
||||
cumOffset = ctx.BLOCK_SIZE_WORDS()*(p + cumOffset);
|
||||
|
||||
layerOffset = blockSize*(nextLayerOffset + layerOffset);
|
||||
}
|
||||
|
||||
long dataMax = header.dataOffsetLongs() + (long) header.numEntries() * ctx.entrySize();
|
||||
return searchDataBlock(key,
|
||||
header.dataOffsetLongs() + ctx.entrySize()*cumOffset,
|
||||
dataMax);
|
||||
// Search the corresponding data block
|
||||
final long searchStart = dataAddress + layerOffset * entrySize;
|
||||
final long lastDataAddress = dataAddress + (long) header.numEntries() * entrySize;
|
||||
final long lastItemInBlockAddress = searchStart + (long) blockSize * entrySize;
|
||||
final long searchEnd = Math.min(lastItemInBlockAddress, lastDataAddress);
|
||||
|
||||
return dataSearcher.binarySearchUpperBound(key, searchStart, (searchEnd - searchStart) / entrySize);
|
||||
}
|
||||
|
||||
|
||||
private long searchEntireTopLayer(BTreeHeader header, long key) {
|
||||
long offset = header.indexOffsetLongs();
|
||||
|
||||
return searcher.binarySearchUpperBound(key, offset, offset + ctx.BLOCK_SIZE_WORDS()) - offset;
|
||||
}
|
||||
|
||||
private long searchLayerBlock(long key, long blockOffset) {
|
||||
if (blockOffset < 0)
|
||||
return blockOffset;
|
||||
|
||||
return searcher.binarySearchUpperBound(key, blockOffset, blockOffset + ctx.BLOCK_SIZE_WORDS()) - blockOffset;
|
||||
}
|
||||
|
||||
|
||||
private long searchDataBlock(long key, long blockOffset, long dataMax) {
|
||||
if (blockOffset < 0)
|
||||
return blockOffset;
|
||||
|
||||
long lastOffset = Math.min(blockOffset+ctx.BLOCK_SIZE_WORDS()*(long)ctx.entrySize(), dataMax);
|
||||
int length = (int)(lastOffset - blockOffset);
|
||||
|
||||
if (ctx.entrySize() == 1) {
|
||||
if (mask == ~0L) return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length);
|
||||
return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, blockOffset+length, mask);
|
||||
}
|
||||
|
||||
return searcher.binarySearchUpperBoundNoMiss(key, blockOffset, ctx.entrySize(), length/ctx.entrySize(), mask);
|
||||
}
|
||||
|
||||
private long trivialSearch(BTreeHeader header, long key) {
|
||||
long offset = header.dataOffsetLongs();
|
||||
|
||||
if (ctx.entrySize() == 1) {
|
||||
if (mask == ~0L) {
|
||||
return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries());
|
||||
}
|
||||
else {
|
||||
return searcher.binarySearchUpperBoundNoMiss(key, offset, offset+header.numEntries(), mask);
|
||||
}
|
||||
}
|
||||
|
||||
return searcher.binarySearchUpperBoundNoMiss(key, offset, ctx.entrySize(), header.numEntries(), mask);
|
||||
|
||||
private long indexSearch(long key, long start, long n) {
|
||||
return indexSearcher.binarySearch(key, start, n) - start;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,16 +2,12 @@ package nu.marginalia.util.btree;
|
||||
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapFileLongSlice;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
public class BTreeWriter {
|
||||
private final Logger logger = LoggerFactory.getLogger(BTreeWriter.class);
|
||||
private final BTreeContext ctx;
|
||||
private final MultimapFileLongSlice map;
|
||||
|
||||
@ -27,7 +23,7 @@ public class BTreeWriter {
|
||||
|
||||
long size = 0;
|
||||
for (int layer = 0; layer < numLayers; layer++) {
|
||||
size += ctx.layerSize(numWords, layer);
|
||||
size += ctx.indexLayerSize(numWords, layer);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
@ -45,17 +41,17 @@ public class BTreeWriter {
|
||||
|
||||
writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
|
||||
|
||||
if (header.layers() < 1) {
|
||||
if (header.layers() < 1) { // The data is too small to benefit from indexing
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
else {
|
||||
writeIndex(header);
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
|
||||
writeIndex(header);
|
||||
|
||||
return ctx.calculateSize(numEntries);
|
||||
}
|
||||
|
||||
public static BTreeHeader makeHeader(BTreeContext ctx, long offset, int numEntries) {
|
||||
final int numLayers = ctx.numLayers(numEntries);
|
||||
final int numLayers = ctx.numIndexLayers(numEntries);
|
||||
|
||||
final int padding = BTreeHeader.getPadding(ctx, offset, numLayers);
|
||||
|
||||
@ -71,46 +67,50 @@ public class BTreeWriter {
|
||||
|
||||
|
||||
private void writeIndex(BTreeHeader header) {
|
||||
var layerOffsets = getRelativeLayerOffsets(header);
|
||||
var layerOffsets = header.getRelativeLayerOffsets(ctx);
|
||||
|
||||
long stride = ctx.BLOCK_SIZE_WORDS();
|
||||
long indexedDataStepSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
/* Index layer 0 indexes the data itself
|
||||
Index layer 1 indexes layer 0
|
||||
Index layer 2 indexes layer 1
|
||||
And so on
|
||||
*/
|
||||
for (int layer = 0; layer < header.layers(); layer++,
|
||||
stride*=ctx.BLOCK_SIZE_WORDS()) {
|
||||
long indexWord = 0;
|
||||
long offsetBase = layerOffsets[layer] + header.indexOffsetLongs();
|
||||
long numEntries = header.numEntries();
|
||||
for (long idx = 0; idx < numEntries; idx += stride, indexWord++) {
|
||||
long dataOffset = header.dataOffsetLongs() + (idx + (stride-1)) * ctx.entrySize();
|
||||
long val;
|
||||
indexedDataStepSize*=ctx.BLOCK_SIZE_WORDS()) {
|
||||
|
||||
if (idx + (stride-1) < numEntries) {
|
||||
val = map.get(dataOffset) & ctx.equalityMask();
|
||||
}
|
||||
else {
|
||||
val = Long.MAX_VALUE;
|
||||
}
|
||||
if (offsetBase + indexWord < 0) {
|
||||
logger.error("bad put @ {}", offsetBase + indexWord);
|
||||
logger.error("layer{}", layer);
|
||||
logger.error("layer offsets {}", layerOffsets);
|
||||
logger.error("offsetBase = {}", offsetBase);
|
||||
logger.error("numEntries = {}", numEntries);
|
||||
logger.error("indexWord = {}", indexWord);
|
||||
}
|
||||
map.put(offsetBase + indexWord, val);
|
||||
}
|
||||
for (; (indexWord % ctx.BLOCK_SIZE_WORDS()) != 0; indexWord++) {
|
||||
map.put(offsetBase + indexWord, Long.MAX_VALUE);
|
||||
}
|
||||
writeIndexLayer(header, layerOffsets, indexedDataStepSize, layer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private long[] getRelativeLayerOffsets(BTreeHeader header) {
|
||||
long[] layerOffsets = new long[header.layers()];
|
||||
for (int i = 0; i < header.layers(); i++) {
|
||||
layerOffsets[i] = header.relativeLayerOffset(ctx, i);
|
||||
private void writeIndexLayer(BTreeHeader header, long[] layerOffsets,
|
||||
final long indexedDataStepSize,
|
||||
final int layer) {
|
||||
|
||||
final long indexOffsetBase = layerOffsets[layer] + header.indexOffsetLongs();
|
||||
final long dataOffsetBase = header.dataOffsetLongs();
|
||||
|
||||
final long dataEntriesMax = header.numEntries();
|
||||
final int entrySize = ctx.entrySize();
|
||||
|
||||
final long lastDataEntryOffset = indexedDataStepSize - 1;
|
||||
|
||||
long indexWord = 0;
|
||||
|
||||
for (long dataPtr = 0;
|
||||
dataPtr + lastDataEntryOffset < dataEntriesMax;
|
||||
dataPtr += indexedDataStepSize)
|
||||
{
|
||||
long dataOffset = dataOffsetBase + (dataPtr + lastDataEntryOffset) * entrySize;
|
||||
map.put(indexOffsetBase + indexWord++, map.get(dataOffset) & ctx.equalityMask());
|
||||
}
|
||||
return layerOffsets;
|
||||
|
||||
// Fill the remaining block with LONG_MAX
|
||||
map.setRange(indexOffsetBase+indexWord,
|
||||
(int) (ctx.BLOCK_SIZE_WORDS() - (indexWord % ctx.BLOCK_SIZE_WORDS())),
|
||||
Long.MAX_VALUE);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -10,7 +10,6 @@ public record BTreeContext(int MAX_LAYERS,
|
||||
|
||||
public BTreeContext(int MAX_LAYERS, int entrySize, long equalityMask, int BLOCK_SIZE_BITS) {
|
||||
this(MAX_LAYERS, entrySize, equalityMask, BLOCK_SIZE_BITS, 1 << BLOCK_SIZE_BITS);
|
||||
|
||||
}
|
||||
|
||||
public long calculateSize(int numEntries) {
|
||||
@ -19,7 +18,7 @@ public record BTreeContext(int MAX_LAYERS,
|
||||
return header.dataOffsetLongs() + (long)numEntries * entrySize;
|
||||
}
|
||||
|
||||
public int numLayers(int numEntries) {
|
||||
public int numIndexLayers(int numEntries) {
|
||||
if (numEntries <= BLOCK_SIZE_WORDS*2) {
|
||||
return 0;
|
||||
}
|
||||
@ -36,11 +35,7 @@ public record BTreeContext(int MAX_LAYERS,
|
||||
return MAX_LAYERS;
|
||||
}
|
||||
|
||||
public long layerSize(int numEntries, int level) {
|
||||
return BLOCK_SIZE_WORDS * numBlocks(numEntries, level);
|
||||
}
|
||||
|
||||
private long numBlocks(int numWords, int level) {
|
||||
public long indexLayerSize(int numWords, int level) {
|
||||
|
||||
long layerSize = 1L<<(BLOCK_SIZE_BITS*(level+1));
|
||||
int numBlocks = 0;
|
||||
@ -50,7 +45,7 @@ public record BTreeContext(int MAX_LAYERS,
|
||||
numBlocks++;
|
||||
}
|
||||
|
||||
return numBlocks;
|
||||
return (long) BLOCK_SIZE_WORDS * numBlocks;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.util.btree.model;
|
||||
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapFileLongSlice;
|
||||
|
||||
public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, long dataOffsetLongs) {
|
||||
@ -36,12 +35,20 @@ public record BTreeHeader(int layers, int numEntries, long indexOffsetLongs, lon
|
||||
}
|
||||
|
||||
|
||||
public long relativeLayerOffset(BTreeContext ctx, int n) {
|
||||
public long relativeIndexLayerOffset(BTreeContext ctx, int n) {
|
||||
long offset = 0;
|
||||
for (int i = n+1; i < layers; i++) {
|
||||
offset += ctx.layerSize( numEntries, i);
|
||||
offset += ctx.indexLayerSize( numEntries, i);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
public long[] getRelativeLayerOffsets(BTreeContext ctx) {
|
||||
long[] layerOffsets = new long[layers()];
|
||||
for (int i = 0; i < layers(); i++) {
|
||||
layerOffsets[i] = relativeIndexLayerOffset(ctx, i);
|
||||
}
|
||||
return layerOffsets;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -97,8 +97,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
readableSize(mapSizeBytes), readableSize(8L*bufferSizeWords), mode);
|
||||
}
|
||||
|
||||
public MultimapSearcher createSearcher() {
|
||||
return new MultimapSearcher(this);
|
||||
public MultimapSearcherBase createSearcher() {
|
||||
return new MultimapSearcherBase(this);
|
||||
}
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit);
|
||||
@ -332,6 +332,34 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
if (n == 0) return;
|
||||
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
for (int p = 0; p < l; p++) {
|
||||
buffer.put(bufferOffset + p, val);
|
||||
}
|
||||
|
||||
i+=l;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
|
||||
|
@ -23,6 +23,11 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
map.put(off+idx, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
map.setRange(off+idx, n, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long get(long idx) {
|
||||
return map.get(off+idx);
|
||||
|
@ -9,6 +9,8 @@ public interface MultimapFileLongSlice {
|
||||
|
||||
void put(long idx, long val);
|
||||
|
||||
void setRange(long idx, int n, long val);
|
||||
|
||||
long get(long idx);
|
||||
|
||||
void read(long[] vals, long idx);
|
||||
|
@ -1,128 +1,80 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import lombok.experimental.Delegate;
|
||||
public interface MultimapSearcher {
|
||||
long binarySearch(long key, long fromIndex, long n);
|
||||
long binarySearchUpperBound(long key, long fromIndex, long n);
|
||||
|
||||
public class MultimapSearcher {
|
||||
@Delegate
|
||||
private final MultimapFileLongSlice mmf;
|
||||
|
||||
public MultimapSearcher(MultimapFileLongSlice mmf) {
|
||||
this.mmf = mmf;
|
||||
}
|
||||
|
||||
public boolean binarySearch(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return true; // key found
|
||||
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
|
||||
if (mask == ~0L && stepSize == 1) {
|
||||
return new SimpleMultimapSearcher(new MultimapSearcherBase(slice));
|
||||
}
|
||||
return false; // key not found.
|
||||
}
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
else if (stepSize == 1) {
|
||||
return new MaskedMultimapSearcher(new MultimapSearcherBase(slice), mask);
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long toIndex, long mask) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
else {
|
||||
return new SteppingMaskedMultimapSearcher(new MultimapSearcherBase(slice), mask, stepSize);
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long toIndex, long mask) {
|
||||
|
||||
long low = fromIndex;
|
||||
long high = toIndex - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBoundNoMiss(long key, long fromIndex, long step, long steps, long mask) {
|
||||
|
||||
long low = 0;
|
||||
long high = steps - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
class SimpleMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
|
||||
SimpleMultimapSearcher(MultimapSearcherBase base) {
|
||||
this.base = base;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearchOffset(key, fromIndex, n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpperBound(key, fromIndex, n);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class MaskedMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
private final long mask;
|
||||
|
||||
MaskedMultimapSearcher(MultimapSearcherBase base, long mask) {
|
||||
this.base = base;
|
||||
this.mask = mask;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearchOffset(key, fromIndex, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpperBound(key, fromIndex, n, mask);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class SteppingMaskedMultimapSearcher implements MultimapSearcher {
|
||||
private final MultimapSearcherBase base;
|
||||
private final long mask;
|
||||
private final int step;
|
||||
|
||||
SteppingMaskedMultimapSearcher(MultimapSearcherBase base, long mask, int step) {
|
||||
this.base = base;
|
||||
this.mask = mask;
|
||||
this.step = step;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
return base.binarySearchOffset(key, fromIndex, step, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpperBound(key, fromIndex, step, n, mask);
|
||||
}
|
||||
}
|
@ -0,0 +1,143 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import lombok.experimental.Delegate;
|
||||
|
||||
public class MultimapSearcherBase {
|
||||
@Delegate
|
||||
private final MultimapFileLongSlice mmf;
|
||||
|
||||
public MultimapSearcherBase(MultimapFileLongSlice mmf) {
|
||||
this.mmf = mmf;
|
||||
}
|
||||
|
||||
public boolean binarySearchTest(long key, long fromIndex, long n) {
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public long binarySearchOffset(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchOffset(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchOffset(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpperBound(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
@ -45,12 +45,12 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
|
||||
private static MultimapFileLong openWordsFile(RandomAccessFile wordsFile) throws IOException {
|
||||
return new MultimapFileLong(wordsFile,
|
||||
FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE, false);
|
||||
FileChannel.MapMode.READ_ONLY, wordsFile.length(), BUFFER_SIZE);
|
||||
}
|
||||
|
||||
public long positionForWord(int wordId) {
|
||||
|
||||
long offset = reader.offsetForEntry(header, wordId);
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
if (offset < 0) {
|
||||
return -1L;
|
||||
}
|
||||
@ -60,7 +60,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
|
||||
public int wordLength(int wordId) {
|
||||
|
||||
long offset = reader.offsetForEntry(header, wordId);
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
if (offset < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
@ -82,7 +82,7 @@ public class SearchIndex implements AutoCloseable {
|
||||
if (!range.isPresent())
|
||||
return false;
|
||||
|
||||
return bTreeReader.offsetForEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0;
|
||||
return bTreeReader.findEntry(bTreeReader.getHeader(range.dataOffset), url) >= 0;
|
||||
}
|
||||
|
||||
public class UrlIndexTree {
|
||||
|
@ -48,9 +48,9 @@ class BTreeWriterTest {
|
||||
@Test
|
||||
void testLayerOffset() {
|
||||
int wcub = ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS()*ctx.BLOCK_SIZE_WORDS();
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 0));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 1));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeLayerOffset(ctx, 2));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 0));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 1));
|
||||
System.out.println(writer.makeHeader(1025, wcub).relativeIndexLayerOffset(ctx, 2));
|
||||
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
var header = writer.makeHeader(0, i);
|
||||
@ -59,7 +59,7 @@ class BTreeWriterTest {
|
||||
printTreeLayout(i, header, ctx);
|
||||
|
||||
if (header.layers() >= 1) {
|
||||
assertEquals(1, ctx.layerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
|
||||
assertEquals(1, ctx.indexLayerSize(i, header.layers() - 1) / ctx.BLOCK_SIZE_WORDS());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -67,7 +67,7 @@ class BTreeWriterTest {
|
||||
private void printTreeLayout(int numEntries, BTreeHeader header, BTreeContext ctx) {
|
||||
StringJoiner sj = new StringJoiner(",");
|
||||
for (int l = 0; l < header.layers(); l++) {
|
||||
sj.add(""+ctx.layerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
|
||||
sj.add(""+ctx.indexLayerSize(numEntries, l)/ctx.BLOCK_SIZE_WORDS());
|
||||
}
|
||||
System.out.println(numEntries + ":" + sj);
|
||||
}
|
||||
@ -86,7 +86,7 @@ class BTreeWriterTest {
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
@ -103,7 +103,7 @@ class BTreeWriterTest {
|
||||
var reader = new BTreeReader(mmf, ctx);
|
||||
var header = reader.getHeader(0);
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.offsetForEntry(header, data[i]);
|
||||
long offset = reader.findEntry(header, data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
}
|
||||
@ -129,7 +129,7 @@ class BTreeWriterTest {
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
|
||||
{
|
||||
var writer = new BTreeWriter(mmf, ctx);
|
||||
@ -146,7 +146,7 @@ class BTreeWriterTest {
|
||||
var reader = new BTreeReader(mmf, ctx);
|
||||
var header = reader.getHeader(0);
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.offsetForEntry(header, data[i]);
|
||||
long offset = reader.findEntry(header, data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
}
|
||||
@ -154,7 +154,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long)(Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains((int)val)) val = (long)(Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.offsetForEntry(header, val));
|
||||
assertEquals(-1, reader.findEntry(header, val));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -197,7 +197,7 @@ class BTreeWriterTest {
|
||||
printTreeLayout(toPut.size(), header, ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.offsetForEntry(header, data[i]);
|
||||
long offset = reader.findEntry(header, data[i]);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
}
|
||||
@ -205,7 +205,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.offsetForEntry(header, val));
|
||||
assertEquals(-1, reader.findEntry(header, val));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -250,7 +250,7 @@ class BTreeWriterTest {
|
||||
printTreeLayout(toPut.size(), header, ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.offsetForEntry(header, data[i] & mask);
|
||||
long offset = reader.findEntry(header, data[i] & mask);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
}
|
||||
@ -258,7 +258,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.offsetForEntry(header, val & mask));
|
||||
assertEquals(-1, reader.findEntry(header, val & mask));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
@ -304,7 +304,7 @@ class BTreeWriterTest {
|
||||
printTreeLayout(toPut.size(), header, ctx);
|
||||
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
long offset = reader.offsetForEntry(header, data[i] & mask);
|
||||
long offset = reader.findEntry(header, data[i] & mask);
|
||||
assertTrue(offset >= 0, "Negative offset for " + i + " -> " + offset);
|
||||
assertEquals(data[i], mmf.get(offset));
|
||||
assertEquals(i, mmf.get(offset+1));
|
||||
@ -313,7 +313,7 @@ class BTreeWriterTest {
|
||||
for (int i = 0; i < 500; i++) {
|
||||
long val = (long) (Long.MAX_VALUE * Math.random());
|
||||
while (toPut.contains(val)) val = (long) (Long.MAX_VALUE * Math.random());
|
||||
assertEquals(-1, reader.offsetForEntry(header, val & mask));
|
||||
assertEquals(-1, reader.findEntry(header, val & mask));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
@ -26,7 +26,7 @@ class LongPairHashMapTest {
|
||||
|
||||
try {
|
||||
RandomAccessFile raf = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
|
||||
MultimapFileLong mmf = new MultimapFileLong(raf, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
var lphm = LongPairHashMap.createNew(mmf, 1024);
|
||||
toPut.forEach(i -> {
|
||||
lphm.put(new LongPairHashMap.CellData(i, i));
|
||||
@ -35,7 +35,7 @@ class LongPairHashMapTest {
|
||||
lphm.close();
|
||||
|
||||
RandomAccessFile raf2 = new RandomAccessFile(tempFile.toFile(), "rw");
|
||||
MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000, true);
|
||||
MultimapFileLong mmf2 = new MultimapFileLong(raf2, FileChannel.MapMode.READ_WRITE, 10000, 1000);
|
||||
var lphm2 = LongPairHashMap.loadExisting(mmf2);
|
||||
toPut.forEach(i -> {
|
||||
Assertions.assertTrue(lphm2.get(i).isSet());
|
||||
|
@ -56,7 +56,7 @@ class MultimapFileTest {
|
||||
@SneakyThrows
|
||||
@Test
|
||||
void put() {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
for (int i = 0; i < 32; i++) {
|
||||
file.put(i, i);
|
||||
}
|
||||
@ -68,7 +68,7 @@ class MultimapFileTest {
|
||||
@SneakyThrows
|
||||
@Test
|
||||
void read() {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
for (int i = 0; i < 32; i++) {
|
||||
file.put(i, i);
|
||||
}
|
||||
@ -85,7 +85,7 @@ class MultimapFileTest {
|
||||
|
||||
@Test
|
||||
void write() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
|
||||
for (int i = 0; i < 32-6; i++) {
|
||||
file.write(new long[] { 0,1,2,3,4,5}, i);
|
||||
@ -98,7 +98,7 @@ class MultimapFileTest {
|
||||
|
||||
@Test
|
||||
void sortInternal() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 16);
|
||||
var searcher = file.createSearcher();
|
||||
for (int i = 0; i < 32; i++) {
|
||||
@ -109,13 +109,13 @@ class MultimapFileTest {
|
||||
|
||||
for (int i = 2+1; i < 16; i++) {
|
||||
assertTrue(file.get(i) > file.get(i-1));
|
||||
assertTrue(searcher.binarySearch(file.get(i), 2, 18));
|
||||
assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void sortExternal() throws IOException {
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8, false);
|
||||
var file = new MultimapFileLong(new RandomAccessFile(tmp, "rw"), FileChannel.MapMode.READ_WRITE, 32, 8);
|
||||
var sorter = file.createSorter(Path.of("/tmp"), 2);
|
||||
var searcher = file.createSearcher();
|
||||
|
||||
@ -128,7 +128,7 @@ class MultimapFileTest {
|
||||
|
||||
for (int i = 2+1; i < 16; i++) {
|
||||
assertTrue(file.get(i) > file.get(i-1));
|
||||
assertTrue(searcher.binarySearch(file.get(i), 2, 18));
|
||||
assertTrue(searcher.binarySearchTest(file.get(i), 2, 16));
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user