(index-reverse) Added compression to priority index

The priority index documents file can be trivially compressed to a large degree.

Compression schema:
```
00b -> diff docord (E gamma)
01b -> diff domainid (E delta) + (1 + docord) (E delta)
10b -> diff rank (E gamma) + domainid,docord (raw)
11b -> 30 bit size header, followed by 1 raw doc id (64 bits: 7 bit rank + 31 bit domainid + 26 bit docord)
```
This commit is contained in:
Viktor Lofgren 2024-07-10 18:34:07 +02:00
parent abf7a8d78d
commit 12590d3449
10 changed files with 295 additions and 67 deletions

View File

@ -40,6 +40,14 @@ public class UrlIdCodec {
return ((long) domainId << 26) | documentOrdinal; return ((long) domainId << 26) | documentOrdinal;
} }
/** Encode a URL id with a ranking element */
public static long encodeId(int rank, int domainId, int documentOrdinal) {
domainId &= 0x7FFF_FFFF;
documentOrdinal &= 0x03FF_FFFF;
rank &= 0x3F;
return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal;
}
/** Add a ranking element to an existing combined URL id. /** Add a ranking element to an existing combined URL id.
* *
* @param rank [0,1] the importance of the domain, low is good * @param rank [0,1] the importance of the domain, low is good

View File

@ -3,23 +3,32 @@ package nu.marginalia.index;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource; import nu.marginalia.index.query.EntrySource;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import static java.lang.Math.min;
public class PrioIndexEntrySource implements EntrySource { public class PrioIndexEntrySource implements EntrySource {
private final String name; private final String name;
int posL; private final ByteBuffer readData = ByteBuffer.allocate(1024);
int endOffsetL; private final BitReader bitReader = new BitReader(readData);
private final FileChannel docsFileChannel; private final FileChannel docsFileChannel;
private final long dataOffsetStartB; private long dataOffsetStartB;
private final long wordId; private final long wordId;
private final int numItems;
private int readItems = 0;
int prevRank = -1;
int prevDomainId = -1;
int prevDocOrd = -1;
public PrioIndexEntrySource(String name, public PrioIndexEntrySource(String name,
int numEntriesL,
FileChannel docsFileChannel, FileChannel docsFileChannel,
long dataOffsetStartB, long dataOffsetStartB,
long wordId) long wordId)
@ -29,41 +38,101 @@ public class PrioIndexEntrySource implements EntrySource {
this.dataOffsetStartB = dataOffsetStartB; this.dataOffsetStartB = dataOffsetStartB;
this.wordId = wordId; this.wordId = wordId;
posL = 0; // sneaky read of the header to get item count upfront
endOffsetL = posL + numEntriesL;
try {
readData.limit(4);
int rb = docsFileChannel.read(readData, dataOffsetStartB);
assert rb == 4;
readData.flip();
numItems = readData.getInt() & 0x3FFF_FFFF;
readData.position(0);
readData.limit(0);
}
catch (IOException ex) {
throw new IllegalStateException("Failed to read index data.", ex);
}
} }
@Override @Override
public void skip(int n) { public void skip(int n) {
posL += n; throw new UnsupportedOperationException("Not implemented");
} }
@Override @Override
@SneakyThrows @SneakyThrows
@SuppressWarnings("preview") @SuppressWarnings("preview")
public void read(LongQueryBuffer buffer) { public void read(LongQueryBuffer buffer) {
buffer.reset(); var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
buffer.end = min(buffer.end, endOffsetL - posL); outputBuffer.clear();
var byteBuffer = buffer.data.getMemorySegment().asByteBuffer(); while (readItems++ < numItems && outputBuffer.hasRemaining()) {
byteBuffer.clear(); fillReadBuffer();
byteBuffer.limit(buffer.end * 8);
while (byteBuffer.hasRemaining()) { int rank;
int rb = docsFileChannel.read(byteBuffer, dataOffsetStartB + posL * 8L + byteBuffer.position()); int domainId;
if (rb == -1) { int docOrd;
throw new IllegalStateException("Unexpected end of file while reading index data.");
int code = bitReader.get(2);
if (code == 0b11) {
// header
bitReader.get(30); // skip 30 bits for the size header
rank = bitReader.get(7);
domainId = bitReader.get(31);
docOrd = bitReader.get(26);
} }
else if (code == 0b10) {
rank = prevRank + bitReader.getGamma();
domainId = bitReader.get(31);
docOrd = bitReader.get(26);
}
else if (code == 0b01) {
rank = prevRank;
domainId = bitReader.getDelta() + prevDomainId;
docOrd = bitReader.getDelta() - 1;
}
else if (code == 0b00) {
rank = prevRank;
domainId = prevDomainId;
docOrd = prevDocOrd + bitReader.getGamma();
}
else {
throw new IllegalStateException("??? found code " + code);
}
long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd);
outputBuffer.putLong(
encodedId
);
prevRank = rank;
prevDomainId = domainId;
prevDocOrd = docOrd;
} }
posL += buffer.end; buffer.end = outputBuffer.position() / 8;
buffer.uniq(); buffer.uniq();
} }
private void fillReadBuffer() throws IOException {
if (readData.remaining() < 8) {
readData.compact();
int rb = docsFileChannel.read(readData, dataOffsetStartB);
if (rb > 0) {
dataOffsetStartB += rb;
}
readData.flip();
}
}
@Override @Override
public boolean hasMore() { public boolean hasMore() {
return posL < endOffsetL; return readItems < numItems;
} }

View File

@ -70,20 +70,9 @@ public class PrioReverseIndexReader {
if (offset < 0) // No documents if (offset < 0) // No documents
return new EmptyEntrySource(); return new EmptyEntrySource();
// Read the number of documents
ByteBuffer buffer = ByteBuffer.allocate(8);
try {
documentsChannel.read(buffer, offset);
}
catch (IOException e) {
logger.error("Failed to read documents channel", e);
return new EmptyEntrySource();
}
return new PrioIndexEntrySource(name, return new PrioIndexEntrySource(name,
(int) buffer.getLong(0),
documentsChannel, documentsChannel,
offset + 8, offset,
termId); termId);
} }
@ -92,7 +81,7 @@ public class PrioReverseIndexReader {
long offset = wordOffset(termId); long offset = wordOffset(termId);
ByteBuffer buffer = ByteBuffer.allocate(8); ByteBuffer buffer = ByteBuffer.allocate(4);
try { try {
documentsChannel.read(buffer, offset); documentsChannel.read(buffer, offset);
} }
@ -101,7 +90,7 @@ public class PrioReverseIndexReader {
return 0; return 0;
} }
return (int) buffer.getLong(0); return buffer.getInt(0) & 0x3FFF_FFFF;
} }

View File

@ -1,17 +1,26 @@
package nu.marginalia.index.construction.prio; package nu.marginalia.index.construction.prio;
import nu.marginalia.array.algo.LongArrayTransformations; import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
/** Constructs document ids list priority reverse index */ /** Constructs document ids list priority reverse index */
public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer { public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer {
private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
private final FileChannel writeChannel; private final FileChannel writeChannel;
private final FileChannel readChannel; private final FileChannel readChannel;
private final ByteBuffer buffer = ByteBuffer.allocate(8192); private final ByteBuffer readBuffer = ByteBuffer.allocate(8192).order(ByteOrder.LITTLE_ENDIAN);
private final ByteBuffer writeBuffer = ByteBuffer.allocate(8192);
long startL = 0; long startL = 0;
long writeOffsetB = 0; long writeOffsetB = 0;
@ -33,25 +42,99 @@ public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTra
} }
readChannel.position(startL * 8); readChannel.position(startL * 8);
readBuffer.clear();
writeBuffer.clear();
buffer.clear(); int toBeRead = 8 * (sizeL);
buffer.putLong(sizeL);
var bitWriter = new BitWriter(writeBuffer);
int prevRank = -1;
int prevDomainId = -1;
int prevDocOrd = -1;
boolean wroteHeader = false;
int toBeWrittenB = 8 * (1 + sizeL);
do { do {
buffer.limit(Math.min(buffer.capacity(), toBeWrittenB)); readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
readChannel.read(buffer); readChannel.read(readBuffer);
buffer.flip(); readBuffer.flip();
while (buffer.hasRemaining()) { if (!wroteHeader) {
int written = writeChannel.write(buffer, writeOffsetB); // write 11b header
writeOffsetB += written; bitWriter.putBits(3, 2);
toBeWrittenB -= written; // encode number of items
bitWriter.putBits(sizeL, 30);
long firstItem = readBuffer.getLong();
prevRank = UrlIdCodec.getRank(firstItem);
prevDomainId = UrlIdCodec.getDomainId(firstItem);
prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
bitWriter.putBits(prevRank, 7);
bitWriter.putBits(prevDomainId, 31);
bitWriter.putBits(prevDocOrd, 26);
wroteHeader = true;
} }
buffer.clear(); while (readBuffer.hasRemaining()) {
} while (toBeWrittenB > 0); long nextId = readBuffer.getLong();
// break down id components
int rank = UrlIdCodec.getRank(nextId);
int domainId = UrlIdCodec.getDomainId(nextId);
int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
// encode components
if (rank != prevRank) {
bitWriter.putBits(0b10, 2);
bitWriter.putGamma(rank - prevRank);
bitWriter.putBits(domainId, 31);
bitWriter.putBits(docOrd, 26);
}
else if (domainId != prevDomainId) {
bitWriter.putBits(0b01, 2);
bitWriter.putDelta(domainId - prevDomainId);
bitWriter.putDelta(1 + docOrd);
}
else if (docOrd != prevDocOrd) {
bitWriter.putBits(0b00, 2);
bitWriter.putGamma(docOrd - prevDocOrd);
}
else {
logger.warn("Unexpected duplicate document id: {}", nextId);
}
prevDocOrd = docOrd;
prevDomainId = domainId;
prevRank = rank;
if (writeBuffer.remaining() < 16) {
writeBuffer.flip();
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
writeBuffer.clear();
}
}
toBeRead -= readBuffer.limit();
readBuffer.clear();
} while (toBeRead > 0);
// write lingering data
// ensure any half-written data is flushed to the buffer
bitWriter.finishLastByte();
writeBuffer.flip();
while (writeBuffer.hasRemaining()) {
int written = writeChannel.write(writeBuffer, writeOffsetB);
writeOffsetB += written;
}
// update the start input pointer
startL = endL; startL = endL;
return startOffsetB; return startOffsetB;
} }

View File

@ -1,5 +1,7 @@
package nu.marginalia.index.construction.prio; package nu.marginalia.index.construction.prio;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitReader;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -7,6 +9,7 @@ import org.junit.jupiter.api.Test;
import java.io.DataInputStream; import java.io.DataInputStream;
import java.io.DataOutputStream; import java.io.DataOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel; import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -36,15 +39,14 @@ class PrioDocIdsTransformerTest {
} }
@Test @Test
public void test() throws IOException { public void testDomainIdDocOrd() throws IOException {
// Write 5 longs to the input file as data // Write 5 longs to the input file as data
try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) { try (var dos = new DataOutputStream(Files.newOutputStream(inputFile))) {
dos.writeLong(1); dos.writeLong(UrlIdCodec.encodeId(0, 0));
dos.writeLong(2); dos.writeLong(UrlIdCodec.encodeId(0, 1));
dos.writeLong(3); dos.writeLong(UrlIdCodec.encodeId(1, 0));
dos.writeLong(4); dos.writeLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
dos.writeLong(5);
} }
try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE); try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
@ -52,19 +54,68 @@ class PrioDocIdsTransformerTest {
{ {
// Transform two segments of the input file and write them to the output file with prefixed sizes // Transform two segments of the input file and write them to the output file with prefixed sizes
var transformer = new PrioDocIdsTransformer(writeChannel, readChannel); var transformer = new PrioDocIdsTransformer(writeChannel, readChannel);
transformer.transform(0, 3); transformer.transform(0, 4);
transformer.transform(1, 5);
} }
// Verify the output file byte[] bytes = Files.readAllBytes(outputFile);
try (var dis = new DataInputStream(Files.newInputStream(outputFile))) { var buffer = ByteBuffer.wrap(bytes);
assertEquals(3, dis.readLong());
assertEquals(1, dis.readLong());
assertEquals(2, dis.readLong()); BitReader reader = new BitReader(buffer);
assertEquals(3, dis.readLong());
assertEquals(2, dis.readLong()); // read the header
assertEquals(4, dis.readLong()); {
assertEquals(5, dis.readLong()); int code = reader.get(2);
int size = reader.get(30);
assertEquals(3, code);
assertEquals(4, size);
}
// read first doc id in parts
int rank = reader.get(7);
int domainId = reader.get(31);
int ordinal = reader.get(26);
assertEquals(0, rank);
assertEquals(0, domainId);
assertEquals(0, ordinal);
{
int code = reader.get(2);
assertEquals(0, code); // increment doc ordinal
int dord = reader.getGamma();
ordinal += dord;
assertEquals(1, ordinal);
}
{
int code = reader.get(2);
assertEquals(1, code); // increment doc ordinal
int diffDomainId = reader.getDelta();
domainId += diffDomainId;
assertEquals(1, domainId);
int abs_ord = reader.getDelta();
ordinal = abs_ord - 1;
assertEquals(0, ordinal);
}
{
int code = reader.get(2);
assertEquals(2, code); // increment doc ordinal
int diffRank = reader.getGamma() - 1;
rank += diffRank;
assertEquals(56, rank);
domainId = reader.get(31);
ordinal = reader.get(26);
assertEquals(4, domainId);
assertEquals(51, ordinal);
} }
} }

View File

@ -12,6 +12,7 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*; import static nu.marginalia.index.construction.full.TestJournalFactory.*;
@ -60,7 +61,8 @@ class PrioPreindexTest {
public void testFinalizeSimple() throws IOException { public void testFinalizeSimple() throws IOException {
var journalReader = journalFactory.createReader( var journalReader = journalFactory.createReader(
new EntryDataWithWordMeta(100, 101, wm(50, 51)), new EntryDataWithWordMeta(100, 101, wm(50, 51)),
new EntryDataWithWordMeta(104, 101, wm(50, 52)) new EntryDataWithWordMeta(104, 101, wm(50, 52)),
new EntryDataWithWordMeta(106, 101, wm(50, 52))
); );
var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir); var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
@ -79,9 +81,10 @@ class PrioPreindexTest {
var lqb = new LongQueryBuffer(32); var lqb = new LongQueryBuffer(32);
entrySource.read(lqb); entrySource.read(lqb);
assertEquals(2, lqb.size()); assertEquals(3, lqb.size());
assertEquals(100, lqb.copyData()[0]); assertEquals(100, lqb.copyData()[0]);
assertEquals(104, lqb.copyData()[1]); assertEquals(104, lqb.copyData()[1]);
assertEquals(106, lqb.copyData()[2]);
} }

View File

@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer;
*/ */
public interface EntrySource { public interface EntrySource {
/** Skip n entries. */ /** Skip n entries. */
@Deprecated
void skip(int n); void skip(int n);
/** Fill the buffer with entries, updating its data and length appropriately. */ /** Fill the buffer with entries, updating its data and length appropriately. */

View File

@ -3,6 +3,7 @@ package nu.marginalia.array.page;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import java.nio.ByteBuffer;
import java.util.Arrays; import java.util.Arrays;
/** A buffer for long values that can be used to filter and manipulate the data. /** A buffer for long values that can be used to filter and manipulate the data.
@ -164,6 +165,11 @@ public class LongQueryBuffer {
finalizeFiltering(); finalizeFiltering();
} }
@SuppressWarnings("preview")
public ByteBuffer asByteBuffer() {
return data.getMemorySegment().asByteBuffer();
}
public String toString() { public String toString() {
return getClass().getSimpleName() + "[" + return getClass().getSimpleName() + "[" +
"read = " + read + "read = " + read +

View File

@ -120,7 +120,8 @@ public class BitWriter {
} }
private void finishLastByte() { /** Finish writing any partially written bit fields to the buffer */
public void finishLastByte() {
// It's possible we have a few bits left over that have yet to be written // It's possible we have a few bits left over that have yet to be written
// to the underlying buffer. We need to write them out now. // to the underlying buffer. We need to write them out now.

View File

@ -324,4 +324,21 @@ class BitWriterTest {
assertEquals(2, reader.getDelta()); assertEquals(2, reader.getDelta());
assertEquals(30, reader.getDelta()); assertEquals(30, reader.getDelta());
} }
@Test
void testGamma2() {
var buffer = ByteBuffer.allocate(8192);
var writer = new BitWriter(buffer);
writer.putBits(0, 2);
writer.putGamma(4);
writer.putBits(0, 2);
writer.putGamma(2);
var ret = writer.finish();
var reader = new BitReader(ret);
reader.get(2);
assertEquals(4, reader.getGamma());
reader.get(2);
assertEquals(2, reader.getGamma());
}
} }