Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 13:19:02 +00:00)
(index-reverse) Added compression to priority index
The priority index documents file can be compressed to a large degree with a fairly simple scheme. Compression schema:

```
00b -> diff docord (Elias gamma)
01b -> diff domainid (Elias delta) + (1 + docord) (Elias delta)
10b -> rank (Elias gamma) + domainid,docord (raw)
11b -> 30 bit size header, followed by 1 raw doc id (61 bits)
```
Parent: abf7a8d78d
Commit: 12590d3449
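To make the schema concrete, here is a compact sketch of the per-document encoding decision (my paraphrase of the PrioDocIdsTransformer changes further down, not an authoritative restatement):

```
// Paraphrase of the per-id encoding choice; prev* are the components of the
// previously written id, writer is a BitWriter over the output buffer.
if (rank != prevRank) {
    writer.putBits(0b10, 2);                  // 10b: rank changed
    writer.putGamma(rank - prevRank);         // rank diff, Elias gamma
    writer.putBits(domainId, 31);             // domain id, raw
    writer.putBits(docOrd, 26);               // doc ordinal, raw
}
else if (domainId != prevDomainId) {
    writer.putBits(0b01, 2);                  // 01b: domain changed
    writer.putDelta(domainId - prevDomainId); // domain diff, Elias delta
    writer.putDelta(1 + docOrd);              // ordinal + 1, Elias delta
}
else {
    writer.putBits(0b00, 2);                  // 00b: same domain, next ordinal
    writer.putGamma(docOrd - prevDocOrd);     // ordinal diff, Elias gamma
}
// Each stream opens with an 11b record: the 2-bit code, a 30-bit item count,
// and the first id's rank/domainId/docOrd written raw.
```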
@@ -40,6 +40,14 @@ public class UrlIdCodec {
         return ((long) domainId << 26) | documentOrdinal;
     }
 
+    /** Encode a URL id with a ranking element */
+    public static long encodeId(int rank, int domainId, int documentOrdinal) {
+        domainId &= 0x7FFF_FFFF;
+        documentOrdinal &= 0x03FF_FFFF;
+        rank &= 0x3F;
+
+        return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
+    }
     /** Add a ranking element to an existing combined URL id.
      *
      * @param rank [0,1] the importance of the domain, low is good
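As an aside, the bit layout of the combined id implied by the masks and shifts in encodeId above (my reading, not spelled out in the commit) is sketched below.

```
// Layout sketch derived from encodeId (illustration only):
//   bit 63       kept clear
//   bits 57..62  rank            (6 bits,  & 0x3F)
//   bits 26..56  domainId        (31 bits, & 0x7FFF_FFFF)
//   bits 0..25   documentOrdinal (26 bits, & 0x03FF_FFFF)
long id = ((long) (rank & 0x3F) << 57)
        | ((long) (domainId & 0x7FFF_FFFF) << 26)
        | (documentOrdinal & 0x03FF_FFFF);
```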
@@ -3,23 +3,32 @@ package nu.marginalia.index;
 import lombok.SneakyThrows;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.sequence.io.BitReader;
+import nu.marginalia.model.id.UrlIdCodec;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 
-import static java.lang.Math.min;
-
 public class PrioIndexEntrySource implements EntrySource {
     private final String name;
 
-    int posL;
-    int endOffsetL;
+    private final ByteBuffer readData = ByteBuffer.allocate(1024);
+    private final BitReader bitReader = new BitReader(readData);
 
     private final FileChannel docsFileChannel;
-    private final long dataOffsetStartB;
+    private long dataOffsetStartB;
     private final long wordId;
 
+    private final int numItems;
+    private int readItems = 0;
+
+    int prevRank = -1;
+    int prevDomainId = -1;
+    int prevDocOrd = -1;
+
     public PrioIndexEntrySource(String name,
-                                int numEntriesL,
                                 FileChannel docsFileChannel,
                                 long dataOffsetStartB,
                                 long wordId)
@@ -29,41 +38,101 @@ public class PrioIndexEntrySource implements EntrySource {
         this.dataOffsetStartB = dataOffsetStartB;
         this.wordId = wordId;
 
-        posL = 0;
-        endOffsetL = posL + numEntriesL;
+        // sneaky read of the header to get item count upfront
+        try {
+            readData.limit(4);
+
+            int rb = docsFileChannel.read(readData, dataOffsetStartB);
+            assert rb == 4;
+            readData.flip();
+            numItems = readData.getInt() & 0x3FFF_FFFF;
+
+            readData.position(0);
+            readData.limit(0);
+        }
+        catch (IOException ex) {
+            throw new IllegalStateException("Failed to read index data.", ex);
+        }
     }
 
     @Override
     public void skip(int n) {
-        posL += n;
+        throw new UnsupportedOperationException("Not implemented");
     }
 
     @Override
     @SneakyThrows
     @SuppressWarnings("preview")
     public void read(LongQueryBuffer buffer) {
-        buffer.reset();
-        buffer.end = min(buffer.end, endOffsetL - posL);
+        var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
+        outputBuffer.clear();
 
-        var byteBuffer = buffer.data.getMemorySegment().asByteBuffer();
-        byteBuffer.clear();
-        byteBuffer.limit(buffer.end * 8);
+        while (readItems++ < numItems && outputBuffer.hasRemaining()) {
+            fillReadBuffer();
 
-        while (byteBuffer.hasRemaining()) {
-            int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position());
-            if (rb == -1) {
-                throw new IllegalStateException("Unexpected end of file while reading index data.");
+            int rank;
+            int domainId;
+            int docOrd;
+
+            int code = bitReader.get(2);
+            if (code == 0b11) {
+                // header
+                bitReader.get(30); // skip 30 bits for the size header
+
+                rank = bitReader.get(7);
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
             }
+            else if (code == 0b10) {
+                rank = prevRank + bitReader.getGamma();
+                domainId = bitReader.get(31);
+                docOrd = bitReader.get(26);
+            }
+            else if (code == 0b01) {
+                rank = prevRank;
+                domainId = bitReader.getDelta() + prevDomainId;
+                docOrd = bitReader.getDelta() - 1;
+            }
+            else if (code == 0b00) {
+                rank = prevRank;
+                domainId = prevDomainId;
+                docOrd = prevDocOrd + bitReader.getGamma();
+            }
+            else {
+                throw new IllegalStateException("??? found code " + code);
+            }
 
-        posL += buffer.end;
+            long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
+
+            outputBuffer.putLong(
+                    encodedId
+            );
+
+            prevRank = rank;
+            prevDomainId = domainId;
+            prevDocOrd = docOrd;
+        }
+
+        buffer.end = outputBuffer.position() / 8;
 
         buffer.uniq();
     }
 
+    private void fillReadBuffer() throws IOException {
+        if (readData.remaining() < 8) {
+            readData.compact();
+            int rb = docsFileChannel.read(readData, dataOffsetStartB);
+            if (rb > 0) {
+                dataOffsetStartB += rb;
+            }
+            readData.flip();
+        }
+    }
+
     @Override
     public boolean hasMore() {
-        return posL < endOffsetL;
+        return readItems < numItems;
     }
@@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
         if (offset < 0) // No documents
             return new EmptyEntrySource();
 
-        // Read the number of documents
-        ByteBuffer buffer = ByteBuffer.allocate(8);
-        try {
-            documentsChannel.read(buffer, offset);
-        }
-        catch (IOException e) {
-            logger.error("Failed to read documents channel", e);
-            return new EmptyEntrySource();
-        }
-
         return new PrioIndexEntrySource(name,
-                (int) buffer.getLong(0),
                 documentsChannel,
-                offset + 8,
+                offset,
                 termId);
     }
@@ -92,7 +81,7 @@ public class PrioReverseIndexReader {
 
         long offset = wordOffset(termId);
 
-        ByteBuffer buffer = ByteBuffer.allocate(8);
+        ByteBuffer buffer = ByteBuffer.allocate(4);
         try {
             documentsChannel.read(buffer, offset);
         }
@@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
             return 0;
         }
 
-        return (int) buffer.getLong(0);
+        return buffer.getInt(0) & 0x3FFF_FFFF;
 
     }
@@ -1,17 +1,26 @@
 package nu.marginalia.index.construction.prio;
 
 import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.channels.FileChannel;
 
 /** Constructs document ids list priority reverse index */
 public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
 
+    private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
+
     private final FileChannel writeChannel;
     private final FileChannel readChannel;
 
-    private final ByteBuffer buffer = ByteBuffer.allocate(8192);
+    private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
+    private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);
 
     long startL = 0;
     long writeOffsetB = 0;
@@ -33,25 +42,99 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
         }
 
         readChannel.position(startL * 8);
+        readBuffer.clear();
+        writeBuffer.clear();
 
-        buffer.clear();
-        buffer.putLong(sizeL);
+        int toBeRead = 8 * (sizeL);
+
+        var bitWriter = new BitWriter(writeBuffer);
+
+        int prevRank = -1;
+        int prevDomainId = -1;
+        int prevDocOrd = -1;
+        boolean wroteHeader = false;
 
-        int toBeWrittenB = 8 * (1 + sizeL);
         do {
-            buffer.limit(Math.min(buffer.capacity(), toBeWrittenB));
-            readChannel.read(buffer);
-            buffer.flip();
+            readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
+            readChannel.read(readBuffer);
+            readBuffer.flip();
 
-            while (buffer.hasRemaining()) {
-                int written = writeChannel.write(buffer, writeOffsetB);
-                writeOffsetB += written;
-                toBeWrittenB -= written;
+            if (!wroteHeader) {
+                // write 11b header
+                bitWriter.putBits(3, 2);
+                // encode number of items
+                bitWriter.putBits(sizeL, 30);
+
+                long firstItem = readBuffer.getLong();
+
+                prevRank = UrlIdCodec.getRank(firstItem);
+                prevDomainId = UrlIdCodec.getDomainId(firstItem);
+                prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
+
+                bitWriter.putBits(prevRank, 7);
+                bitWriter.putBits(prevDomainId, 31);
+                bitWriter.putBits(prevDocOrd, 26);
+
+                wroteHeader = true;
             }
 
-            buffer.clear();
-        } while (toBeWrittenB > 0);
+            while (readBuffer.hasRemaining()) {
+                long nextId = readBuffer.getLong();
+
+                // break down id components
+                int rank = UrlIdCodec.getRank(nextId);
+                int domainId = UrlIdCodec.getDomainId(nextId);
+                int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
+
+                // encode components
+                if (rank != prevRank) {
+                    bitWriter.putBits(0b10, 2);
+                    bitWriter.putGamma(rank - prevRank);
+                    bitWriter.putBits(domainId, 31);
+                    bitWriter.putBits(docOrd, 26);
+                }
+                else if (domainId != prevDomainId) {
+                    bitWriter.putBits(0b01, 2);
+                    bitWriter.putDelta(domainId - prevDomainId);
+                    bitWriter.putDelta(1 + docOrd);
+                }
+                else if (docOrd != prevDocOrd) {
+                    bitWriter.putBits(0b00, 2);
+                    bitWriter.putGamma(docOrd - prevDocOrd);
+                }
+                else {
+                    logger.warn("Unexpected duplicate document id: {}", nextId);
+                }
+
+                prevDocOrd = docOrd;
+                prevDomainId = domainId;
+                prevRank = rank;
+
+                if (writeBuffer.remaining() < 16) {
+                    writeBuffer.flip();
+                    int written = writeChannel.write(writeBuffer, writeOffsetB);
+                    writeOffsetB += written;
+                    writeBuffer.clear();
+                }
+            }
+
+            toBeRead -= readBuffer.limit();
+            readBuffer.clear();
+        } while (toBeRead > 0);
+
+        // write lingering data
+
+        // ensure any half-written data is flushed to the buffer
+        bitWriter.finishLastByte();
+
+        writeBuffer.flip();
+        while (writeBuffer.hasRemaining()) {
+            int written = writeChannel.write(writeBuffer, writeOffsetB);
+            writeOffsetB += written;
+        }
+
+        // update the start input pointer
         startL = endL;
         return startOffsetB;
     }
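A note on the 16-byte flush threshold in the loop above (my estimate, not stated in the commit): the largest record is the 11b header at 2 + 30 + 7 + 31 + 26 = 96 bits, i.e. 12 bytes, while a 10b record needs at most 2 + 11 + 31 + 26 = 70 bits, since the Elias gamma code of a rank delta up to 63 takes at most 11 bits. Flushing whenever fewer than 16 bytes remain therefore appears to leave room for any single record plus a partially filled byte.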
@@ -1,5 +1,7 @@
 package nu.marginalia.index.construction.prio;
 
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
     }
 
     @Test
-    public void test() throws IOException {
+    public void testDomainIdDocOrd() throws IOException {
 
         // Write 5 longs to the input file as data
         try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
-            dos.writeLong(1);
-            dos.writeLong(2);
-            dos.writeLong(3);
-            dos.writeLong(4);
-            dos.writeLong(5);
+            dos.writeLong(UrlIdCodec.encodeId(0, 0));
+            dos.writeLong(UrlIdCodec.encodeId(0, 1));
+            dos.writeLong(UrlIdCodec.encodeId(1, 0));
+            dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
         }
 
         try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
@@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
         {
             // Transform two segments of the input file and write them to the output file with prefixed sizes
             var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
-            transformer.transform(0, 3);
-            transformer.transform(1, 5);
+            transformer.transform(0, 4);
         }
 
-        // Verify the output file
-        try (var dis = new DataInputStream(Files.newInputStream(outputFile))) {
-            assertEquals(3, dis.readLong());
-            assertEquals(1, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(3, dis.readLong());
-            assertEquals(2, dis.readLong());
-            assertEquals(4, dis.readLong());
-            assertEquals(5, dis.readLong());
+        byte[] bytes = Files.readAllBytes(outputFile);
+        var buffer = ByteBuffer.wrap(bytes);
+
+        BitReader reader = new BitReader(buffer);
+
+        // read the header
+        {
+            int code = reader.get(2);
+            int size = reader.get(30);
+            assertEquals(3, code);
+            assertEquals(4, size);
+        }
+
+        // read first doc id in parts
+        int rank = reader.get(7);
+        int domainId = reader.get(31);
+        int ordinal = reader.get(26);
+
+        assertEquals(0, rank);
+        assertEquals(0, domainId);
+        assertEquals(0, ordinal);
+
+        {
+            int code = reader.get(2);
+            assertEquals(0, code); // increment doc ordinal
+
+            int dord = reader.getGamma();
+            ordinal += dord;
+
+            assertEquals(1, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(1, code); // increment doc ordinal
+
+            int diffDomainId = reader.getDelta();
+            domainId += diffDomainId;
+            assertEquals(1, domainId);
+
+            int abs_ord = reader.getDelta();
+            ordinal = abs_ord - 1;
+            assertEquals(0, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(2, code); // increment doc ordinal
+
+            int diffRank = reader.getGamma() - 1;
+            rank += diffRank;
+            assertEquals(56, rank);
+
+            domainId = reader.get(31);
+            ordinal = reader.get(26);
+
+            assertEquals(4, domainId);
+            assertEquals(51, ordinal);
         }
     }
@@ -12,6 +12,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 
 import static nu.marginalia.index.construction.full.TestJournalFactory.*;
@@ -60,7 +61,8 @@ class PrioPreindexTest {
     public void testFinalizeSimple() throws IOException {
         var journalReader = journalFactory.createReader(
                 new EntryDataWithWordMeta(100, 101, wm(50, 51)),
-                new EntryDataWithWordMeta(104, 101, wm(50, 52))
+                new EntryDataWithWordMeta(104, 101, wm(50, 52)),
+                new EntryDataWithWordMeta(106, 101, wm(50, 52))
         );
 
         var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
@@ -79,9 +81,10 @@ class PrioPreindexTest {
         var lqb = new LongQueryBuffer(32);
         entrySource.read(lqb);
 
-        assertEquals(2, lqb.size());
+        assertEquals(3, lqb.size());
         assertEquals(100, lqb.copyData()[0]);
         assertEquals(104, lqb.copyData()[1]);
+        assertEquals(106, lqb.copyData()[2]);
     }
@@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
  */
 public interface EntrySource {
     /** Skip n entries. */
+    @Deprecated
     void skip(int n);
 
     /** Fill the buffer with entries, updating its data and length appropriately. */
@@ -3,6 +3,7 @@ package nu.marginalia.array.page;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 
+import java.nio.ByteBuffer;
 import java.util.Arrays;
 
 /** A buffer for long values that can be used to filter and manipulate the data.
@@ -164,6 +165,11 @@ public class LongQueryBuffer {
         finalizeFiltering();
     }
 
+    @SuppressWarnings("preview")
+    public ByteBuffer asByteBuffer() {
+        return data.getMemorySegment().asByteBuffer();
+    }
+
     public String toString() {
         return getClass().getSimpleName() + "[" +
                 "read = " + read +
@@ -120,7 +120,8 @@ public class BitWriter {
     }
 
 
-    private void finishLastByte() {
+    /** Finish writing any partially written bit fields to the buffer */
+    public void finishLastByte() {
         // It's possible we have a few bits left over that have yet to be written
         // to the underlying buffer. We need to write them out now.
 
@@ -324,4 +324,21 @@ class BitWriterTest {
         assertEquals(2, reader.getDelta());
         assertEquals(30, reader.getDelta());
     }
+
+    @Test
+    void testGamma2() {
+        var buffer = ByteBuffer.allocate(8192);
+        var writer = new BitWriter(buffer);
+        writer.putBits(0, 2);
+        writer.putGamma(4);
+        writer.putBits(0, 2);
+        writer.putGamma(2);
+        var ret = writer.finish();
+
+        var reader = new BitReader(ret);
+        reader.get(2);
+        assertEquals(4, reader.getGamma());
+        reader.get(2);
+        assertEquals(2, reader.getGamma());
+    }
 }