(index, WIP) Position data partially integrated with forward and reverse indexes.

There's no graceful way of doing this in small commits, so it's pushed in one go to avoid the risk of data loss.
This commit is contained in:
Viktor Lofgren 2024-06-06 12:54:52 +02:00
parent 9b922af075
commit 4a8afa6b9f
42 changed files with 1019 additions and 718 deletions

View File

@ -1,6 +1,5 @@
package nu.marginalia.keyword.model;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.Serial;
@ -26,26 +25,6 @@ public final class DocumentKeywords implements Serializable {
assert keywords.length == metadata.length;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append('[');
var pointer = newPointer();
while (pointer.advancePointer()) {
sb.append("\n\t ");
long metadata = pointer.getMetadata();
String keyword = pointer.getKeyword();
sb.append(keyword);
if (metadata != 0) {
sb.append("/").append(new WordMetadata(metadata));
}
}
return sb.append("\n]").toString();
}
public boolean isEmpty() {
return keywords.length == 0;
}
@ -54,11 +33,6 @@ public final class DocumentKeywords implements Serializable {
return keywords.length;
}
/** Return a pointer for traversing this structure */
public DocumentKeywordsPointer newPointer() {
return new DocumentKeywordsPointer(this);
}
}
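With the pointer API removed, callers are expected to index the parallel arrays directly, as the updated extractor test below does. A minimal sketch (field names as in this class):

for (int i = 0; i < keywords.size(); i++) {
    String keyword = keywords.keywords[i];                 // the keyword itself
    long meta = keywords.metadata[i];                      // encoded term metadata
    GammaCodedSequence positions = keywords.positions[i];  // gamma coded positions
}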

View File

@ -41,7 +41,7 @@ public class DocumentKeywordsBuilder {
meta[i] = entry.getLongValue();
wordArray[i] = entry.getKey();
positions[i] = GammaCodedSequence.generate(workArea, wordToPos.get(entry.getKey()));
positions[i] = GammaCodedSequence.generate(workArea, wordToPos.getOrDefault(entry.getKey(), IntList.of()));
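// Map.get() returns null for keywords without recorded positions; getOrDefault
// substitutes an empty IntList so generate() always receives a sequence to encode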
}
return new DocumentKeywords(wordArray, meta, positions);

View File

@ -1,48 +0,0 @@
package nu.marginalia.keyword.model;
import nu.marginalia.sequence.GammaCodedSequence;
/** Pointer into a {@link DocumentKeywords}. It starts out before the first position;
* advance it forward with advancePointer().
* */
public class DocumentKeywordsPointer {
private int pos = -1;
private final DocumentKeywords keywords;
DocumentKeywordsPointer(DocumentKeywords keywords) {
this.keywords = keywords;
}
/** Number of positions remaining */
public int remaining() {
return keywords.size() - Math.max(0, pos);
}
/** Return the keyword associated with the current position */
public String getKeyword() {
return keywords.keywords[pos];
}
/** Return the metadata associated with the current position */
public long getMetadata() {
return keywords.metadata[pos];
}
/** Return the positions associated with the current position */
public GammaCodedSequence getPositions() {
return keywords.positions[pos];
}
/** Advance the current position,
* returning false if this was the
* last position */
public boolean advancePointer() {
return ++pos < keywords.size();
}
/** Returns true unless the pointer is beyond the last position in the keyword set */
public boolean hasMore() {
return pos + 1 < keywords.size();
}
}

View File

@ -92,26 +92,17 @@ class DocumentKeywordExtractorTest {
);
var keywordsBuilt = keywords.build();
var ptr = keywordsBuilt.newPointer();
Map<String, WordMetadata> flags = new HashMap<>();
Map<String, GammaCodedSequence> positions = new HashMap<>();
ByteBuffer work = ByteBuffer.allocate(1024);
for (int i = 0; i < keywordsBuilt.size(); i++) {
String keyword = keywordsBuilt.keywords[i];
long metadata = keywordsBuilt.metadata[i];
while (ptr.advancePointer()) {
System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions());
int[] vals = ptr.getPositions().decode().toIntArray();
for (int i = 0; i < vals.length; i++) {
vals[i] = vals[i] + 1;
}
var out = EliasGammaCodec.encode(work, vals);
System.out.println(out.capacity() + "/" + vals.length * 4);
if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()));
positions.put(ptr.getKeyword(), ptr.getPositions());
if (Set.of("dirty", "blues").contains(keyword)) {
flags.put(keyword, new WordMetadata(metadata));
positions.put(keyword, keywordsBuilt.positions[i]);
}
}

View File

@ -15,12 +15,14 @@ dependencies {
implementation 'org.jgrapht:jgrapht-core:1.5.2'
implementation project(':third-party:commons-codec')
implementation project(':third-party:parquet-floor')
implementation project(':code:index:api')
implementation project(':code:functions:link-graph:api')
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:common:db')
implementation project(':code:common:config')

View File

@ -15,6 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')

View File

@ -2,12 +2,14 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -69,40 +71,40 @@ class ForwardIndexConverterTest {
TestUtil.clearTempDir(dataDir);
}
public int[] getFactorsI(int id) {
return IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
}
long createId(long url, long domain) {
return UrlIdCodec.encodeId((int) domain, (int) url);
}
public void createEntry(IndexJournalWriter writer, int id) {
int[] factors = getFactorsI(id);
var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5);
for (int i = 0; i+1 < factors.length; i+=2) {
entryBuilder.add(factors[i], -factors[i+1]);
}
writer.put(entryBuilder.build());
writer.put(
new IndexJournalEntryHeader(createId(id, id/20),
id%3,
(id % 5)),
new IndexJournalEntryData(
new String[]{},
new long[]{},
new GammaCodedSequence[]{}
)
);
}
@Test
void testForwardIndex() throws IOException {
new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert();
new ForwardIndexConverter(new FakeProcessHeartbeat(),
new IndexJournalReaderSingleFile(indexFile),
docsFileId,
docsFileData,
new DomainRankings()).convert();
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
for (int i = 36; i < workSetSize; i++) {
long docId = createId(i, i/20);
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
assertEquals((i % 3), forwardReader.getHtmlFeatures(docId));
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
}
}
}

View File

@ -13,8 +13,11 @@ java {
apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:array')
implementation project(':code:common:model')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j
@ -23,6 +26,7 @@ dependencies {
implementation libs.guava
implementation libs.trove
implementation libs.zstd
implementation libs.fastutil
implementation libs.commons.lang3
implementation libs.roaringbitmap

View File

@ -1,27 +0,0 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.model.id.UrlIdCodec;
/** An entry in the index journal.
*
* @param header the header of the entry, containing document level data
* @param data the data of the entry, containing keyword level data
*
* @see IndexJournalEntryHeader
* @see IndexJournalEntryData
*/
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
return new IndexJournalEntryBuilder(0, documentId, documentMeta);
}
public static IndexJournalEntryBuilder builder(int domainId,
int urlId,
long documentMeta) {
return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta);
}
}

View File

@ -1,37 +0,0 @@
package nu.marginalia.index.journal.model;
import gnu.trove.list.array.TLongArrayList;
public class IndexJournalEntryBuilder {
private final long documentId;
private final int documentFeatures;
private final long documentMeta;
private final TLongArrayList items = new TLongArrayList();
public IndexJournalEntryBuilder(
int documentFeatures,
long documentId,
long documentMeta) {
this.documentFeatures = documentFeatures;
this.documentId = documentId;
this.documentMeta = documentMeta;
}
public IndexJournalEntryBuilder add(long wordId, long metadata) {
items.add(wordId);
items.add(metadata);
return this;
}
public IndexJournalEntry build() {
return new IndexJournalEntry(
new IndexJournalEntryHeader(items.size(),
documentFeatures,
documentId,
documentMeta),
new IndexJournalEntryData(items.toArray())
);
}
}

View File

@ -1,77 +1,36 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.sequence.GammaCodedSequence;
import java.util.Arrays;
import java.util.Iterator;
public record IndexJournalEntryData(long[] termIds,
long[] metadata,
GammaCodedSequence[] positions) {
/** The keyword data of an index journal entry.
* The data itself is an interleaved array of
* word ids and metadata.
* <p>
* Odd entries are term ids, even entries are encoded WordMetadata records.
* </p>
* <p>The civilized way of reading the journal data is to use an IndexJournalReader</p>
*
* @see WordMetadata
* @see IndexJournalReader
*/
public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> {
private final int size;
public final long[] underlyingArray;
public static final int MAX_LENGTH = 1000;
public static final int ENTRY_SIZE = 2;
public IndexJournalEntryData(long[] underlyingArray) {
this.size = underlyingArray.length;
this.underlyingArray = underlyingArray;
public IndexJournalEntryData {
assert termIds.length == metadata.length;
assert termIds.length == positions.length;
}
public IndexJournalEntryData(int size, long[] underlyingArray) {
this.size = size;
this.underlyingArray = underlyingArray;
public IndexJournalEntryData(String[] keywords,
long[] metadata,
GammaCodedSequence[] positions)
{
this(termIds(keywords), metadata, positions);
}
public long get(int idx) {
if (idx >= size)
throw new ArrayIndexOutOfBoundsException(idx + " vs " + size);
return underlyingArray[idx];
}
private static final MurmurHash3_128 hash = new MurmurHash3_128();
public int size() {
return size;
}
public long[] toArray() {
if (size == underlyingArray.length)
return underlyingArray;
else
return Arrays.copyOf(underlyingArray, size);
return termIds.length;
}
public String toString() {
return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
}
public Iterator<Record> iterator() {
return new EntryIterator();
}
private class EntryIterator implements Iterator<Record> {
int pos = -ENTRY_SIZE;
public boolean hasNext() {
return pos + 2*ENTRY_SIZE - 1 < size;
}
@Override
public Record next() {
pos+=ENTRY_SIZE;
return new Record(underlyingArray[pos], underlyingArray[pos+1]);
private static long[] termIds(String[] keywords) {
long[] termIds = new long[keywords.length];
for (int i = 0; i < keywords.length; i++) {
termIds[i] = hash.hashKeyword(keywords[i]);
}
return termIds;
}
public record Record(long wordId, long metadata) {}
}

View File

@ -0,0 +1,20 @@
package nu.marginalia.index.journal.model;
import nu.marginalia.sequence.GammaCodedSequence;
/** Data corresponding to a term in a document in the index journal.
*
* @param termId the id of the term
* @param metadata the metadata of the term
* @param positions the positions of the word in the document, gamma coded
*
* @see GammaCodedSequence
*/
public record IndexJournalEntryTermData(
long termId,
long metadata,
GammaCodedSequence positions)
{
}
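A sketch of how one of these entries might be consumed (mirroring the writer tests below, with iter being the pointer's term iterator; values() decodes the gamma coded sequence into an IntList):

IndexJournalEntryTermData termData = iter.next();
long termId = termData.termId();                    // murmur hash of the keyword
long meta = termData.metadata();                    // encoded term metadata
IntList positions = termData.positions().values();  // decoded positions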

View File

@ -1,35 +1,29 @@
package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.Iterator;
public class IndexJournalReadEntry {
public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData> {
public final IndexJournalEntryHeader header;
private final long[] buffer;
private final ByteBuffer buffer;
private final int initialPos;
public IndexJournalReadEntry(IndexJournalEntryHeader header, long[] buffer) {
public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) {
this.header = header;
this.buffer = buffer;
this.initialPos = buffer.position();
}
record WorkArea(byte[] bytes, LongBuffer buffer) {
WorkArea(byte[] bytes) {
this(bytes, ByteBuffer.wrap(bytes).asLongBuffer());
}
WorkArea() {
this(new byte[8*65536]);
}
}
static ThreadLocal<WorkArea> pool = ThreadLocal.withInitial(WorkArea::new);
static ThreadLocal<ByteBuffer> pool = ThreadLocal.withInitial(() -> ByteBuffer.allocate(8*65536));
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
@ -44,13 +38,11 @@ public class IndexJournalReadEntry {
meta);
var workArea = pool.get();
inputStream.readFully(workArea.bytes, 0, 8 * header.entrySize());
long[] out = new long[header.entrySize()];
workArea.buffer.get(0, out, 0, out.length);
return new IndexJournalReadEntry(header, out);
inputStream.readFully(workArea.array(), 0, header.entrySize());
workArea.position(0);
workArea.limit(header.entrySize());
return new IndexJournalReadEntry(header, workArea);
}
public long docId() {
@ -61,12 +53,54 @@ public class IndexJournalReadEntry {
return header.documentMeta();
}
public int documentFeatures() {
return header.documentFeatures();
}
public int domainId() {
return UrlIdCodec.getDomainId(docId());
}
public IndexJournalEntryData readEntry() {
return new IndexJournalEntryData(header.entrySize(), buffer);
public void reset() {
buffer.position(initialPos);
}
public Iterator<IndexJournalEntryTermData> iterator() {
return new TermDataIterator(buffer, initialPos);
}
}
class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
private final ByteBuffer buffer;
TermDataIterator(ByteBuffer buffer, int initialPos) {
this.buffer = buffer;
this.buffer.position(initialPos);
}
@Override
public boolean hasNext() {
return buffer.position() < buffer.limit();
}
@Override
public IndexJournalEntryTermData next() {
// read the metadata for the term
long termId = buffer.getLong();
long meta = buffer.getLong();
// read the size of the sequence data
int size = buffer.get() & 0xFF;
// slice the buffer to get the sequence data
var slice = buffer.slice(buffer.position(), size);
var sequence = new GammaCodedSequence(slice);
// advance the buffer position to the next term
buffer.position(buffer.position() + size);
return new IndexJournalEntryTermData(termId, meta, sequence);
}
}

View File

@ -12,6 +12,9 @@ public interface IndexJournalReader {
int FILE_HEADER_SIZE_LONGS = 2;
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
int DOCUMENT_HEADER_SIZE_BYTES = 24;
int TERM_HEADER_SIZE_BYTES = 17;
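// = 8 bytes termId + 8 bytes metadata + 1 byte positions size (see TermDataIterator)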
/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException {
return new IndexJournalReaderSingleFile(fileName);
@ -25,22 +28,23 @@ public interface IndexJournalReader {
default void forEachWordId(LongConsumer consumer) {
var ptr = this.newPointer();
while (ptr.nextDocument()) {
while (ptr.nextRecord()) {
consumer.accept(ptr.wordId());
for (var termData : ptr) {
consumer.accept(termData.termId());
}
}
}
default void forEachDocId(LongConsumer consumer) {
var ptr = this.newPointer();
while (ptr.nextDocument()) {
consumer.accept(ptr.documentId());
default void forEachDocId(LongConsumer consumer) throws IOException {
try (var ptr = this.newPointer()) {
while (ptr.nextDocument()) {
consumer.accept(ptr.documentId());
}
}
}
/** Create a new pointer to the journal. The IndexJournalPointer is
* a two-tiered iterator that allows both iteration over document records
* and their keywords
* and the terms within each document.
*/
IndexJournalPointer newPointer();
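A minimal sketch of the intended read loop, as exercised by the tests further down:

try (var ptr = reader.newPointer()) {
    while (ptr.nextDocument()) {
        long docId = ptr.documentId();
        for (var termData : ptr) {
            // termData.termId(), termData.metadata(), termData.positions()
        }
    }
}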

View File

@ -16,12 +16,15 @@ public class IndexJournalReaderPagingImpl implements IndexJournalReader {
private final List<IndexJournalReader> readers;
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir);
if (inputFiles.isEmpty())
this(IndexJournalFileNames.findJournalFiles(baseDir));
if (readers.isEmpty())
logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir);
else
logger.info("Creating paging index journal reader for {} inputs", inputFiles.size());
logger.info("Creating paging index journal reader for {} inputs", readers.size());
}
public IndexJournalReaderPagingImpl(List<Path> inputFiles) throws IOException {
this.readers = new ArrayList<>(inputFiles.size());
for (var inputFile : inputFiles) {

View File

@ -2,18 +2,20 @@ package nu.marginalia.index.journal.reader;
import com.github.luben.zstd.ZstdInputStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
import org.jetbrains.annotations.NotNull;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
public class IndexJournalReaderSingleFile implements IndexJournalReader {
private Path journalFile;
private final Path journalFile;
public final IndexJournalFileHeader fileHeader;
@Override
@ -58,8 +60,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
private final IndexJournalFileHeader fileHeader;
private final DataInputStream dataInputStream;
private IndexJournalReadEntry entry;
private IndexJournalEntryData entryData;
private int recordIdx = -2;
private int docIdx = -1;
public SingleFileJournalPointer(
@ -73,9 +73,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
@SneakyThrows
@Override
public boolean nextDocument() {
recordIdx = -2;
entryData = null;
if (++docIdx < fileHeader.fileSizeRecords()) {
entry = IndexJournalReadEntry.read(dataInputStream);
return true;
@ -86,19 +83,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
return false;
}
@Override
public boolean nextRecord() {
if (entryData == null) {
entryData = entry.readEntry();
}
recordIdx += 2;
if (recordIdx < entryData.size()) {
return true;
}
return false;
}
@Override
public long documentId() {
return entry.docId();
@ -109,22 +93,21 @@ class SingleFileJournalPointer implements IndexJournalPointer {
return entry.docMeta();
}
@Override
public long wordId() {
return entryData.get(recordIdx);
public int documentFeatures() { return entry.documentFeatures(); }
/** Return an iterator over the terms in the current document.
* This iterator is not valid after calling nextDocument().
*/
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return entry.iterator();
}
@Override
public long wordMeta() {
return entryData.get(recordIdx + 1);
}
@Override
public int documentFeatures() {
if (entryData == null) {
entryData = entry.readEntry();
}
return entry.header.documentFeatures();
public void close() throws IOException {
dataInputStream.close();
}
}

View File

@ -1,5 +1,10 @@
package nu.marginalia.index.journal.reader.pointer;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.function.LongPredicate;
/**
@ -13,7 +18,7 @@ import java.util.function.LongPredicate;
* nextDocument() will move the pointer from doc1 to doc2;<br>
iterating over the pointer yields word1, word2... within the current document<br>
*/
public interface IndexJournalPointer {
public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>, AutoCloseable {
/**
* Advance to the next document in the journal,
* returning true if such a document exists.
@ -22,11 +27,6 @@ public interface IndexJournalPointer {
*/
boolean nextDocument();
/**
* Advance to the next record in the journal
*/
boolean nextRecord();
/**
* Get the id associated with the current document
*/
@ -37,16 +37,6 @@ public interface IndexJournalPointer {
*/
long documentMeta();
/**
* Get the wordId associated with the current record
*/
long wordId();
/**
* Get the termMeta associated with the current record
*/
long wordMeta();
/**
* Get the documentFeatures associated with the current record
*/
@ -64,6 +54,8 @@ public interface IndexJournalPointer {
default IndexJournalPointer filterWordMeta(LongPredicate filter) {
return new FilteringJournalPointer(this, filter);
}
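// sketch: keep only terms whose metadata passes the predicate, e.g.
// var filtered = reader.newPointer().filterWordMeta(meta -> (meta & flagMask) != 0); // flagMask is hypothetical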
void close() throws IOException;
}
class JoiningJournalPointer implements IndexJournalPointer {
@ -86,11 +78,6 @@ class JoiningJournalPointer implements IndexJournalPointer {
return false;
}
@Override
public boolean nextRecord() {
return pointers[pIndex].nextRecord();
}
@Override
public long documentId() {
return pointers[pIndex].documentId();
@ -101,20 +88,28 @@ class JoiningJournalPointer implements IndexJournalPointer {
return pointers[pIndex].documentMeta();
}
@Override
public long wordId() {
return pointers[pIndex].wordId();
}
@Override
public long wordMeta() {
return pointers[pIndex].wordMeta();
}
@Override
public int documentFeatures() {
return pointers[pIndex].documentFeatures();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return pointers[pIndex].iterator();
}
public void close() {
for (var p : pointers) {
try {
p.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
class FilteringJournalPointer implements IndexJournalPointer {
@ -128,14 +123,10 @@ class FilteringJournalPointer implements IndexJournalPointer {
@Override
public boolean nextDocument() {
return base.nextDocument();
}
@Override
public boolean nextRecord() {
while (base.nextRecord()) {
if (filter.test(wordMeta()))
while (base.nextDocument()) {
if (iterator().hasNext()) {
return true;
}
}
return false;
}
@ -150,18 +141,49 @@ class FilteringJournalPointer implements IndexJournalPointer {
return base.documentMeta();
}
@Override
public long wordId() {
return base.wordId();
}
@Override
public long wordMeta() {
return base.wordMeta();
}
@Override
public int documentFeatures() {
return base.documentFeatures();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
return new Iterator<>() {
private final Iterator<IndexJournalEntryTermData> baseIter = base.iterator();
private IndexJournalEntryTermData value = null;
@Override
public boolean hasNext() {
if (value != null) {
return true;
}
while (baseIter.hasNext()) {
value = baseIter.next();
if (filter.test(value.metadata())) {
return true;
}
}
value = null;
return false;
}
@Override
public IndexJournalEntryTermData next() {
if (hasNext()) {
var ret = value;
value = null;
return ret;
} else {
throw new NoSuchElementException("No more elements");
}
}
};
}
@Override
public void close() throws IOException {
base.close();
}
}

View File

@ -1,8 +1,8 @@
package nu.marginalia.index.journal.writer;
import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import java.io.IOException;
@ -12,18 +12,7 @@ import java.io.IOException;
* @see IndexJournalWriterPagingImpl
*/
public interface IndexJournalWriter extends AutoCloseable {
/** Write an entry to the journal.
*
* @param header the header of the entry
* @param entry the data of the entry
*
* @return the number of bytes written
*/
int put(IndexJournalEntryHeader header, IndexJournalEntryData entry);
default int put(IndexJournalEntry entry) {
return put(entry.header(), entry.data());
}
void close() throws IOException;
int put(IndexJournalEntryHeader header, IndexJournalEntryData data);
}

View File

@ -49,13 +49,14 @@ public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
public int put(IndexJournalEntryHeader header, IndexJournalEntryData data)
{
if (bytesWritten >= sizeLimitBytes) {
bytesWritten = 0;
switchToNextWriter();
}
int writtenNow = currentWriter.put(header, entry);
int writtenNow = currentWriter.put(header, data);
bytesWritten += writtenNow;
return writtenNow;

View File

@ -2,8 +2,9 @@ package nu.marginalia.index.journal.writer;
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
import lombok.SneakyThrows;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -22,6 +23,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
private static final int ZSTD_BUFFER_SIZE = 8192;
private static final int DATA_BUFFER_SIZE = 8192;
private final MurmurHash3_128 hasher = new MurmurHash3_128();
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
private final ZstdDirectBufferCompressingStream compressingStream;
@ -75,36 +78,48 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
@Override
@SneakyThrows
public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
public int put(IndexJournalEntryHeader header,
IndexJournalEntryData data)
{
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
dataBuffer.putInt(entry.size());
final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final var positions = data.positions();
int recordSize = 0; // byte size of the term data; the 24 byte document header is not counted here
for (int i = 0; i < keywords.length; i++) {
// term header is 17 bytes: termId (8) + metadata (8) + positions size (1)
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
}
dataBuffer.putInt(recordSize);
dataBuffer.putInt(header.documentFeatures());
dataBuffer.putLong(header.combinedId());
dataBuffer.putLong(header.documentMeta());
for (int i = 0; i < entry.size(); ) {
int remaining = (dataBuffer.capacity() - dataBuffer.position()) / 8;
if (remaining <= 0) {
for (int i = 0; i < keywords.length; i++) {
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
else while (remaining-- > 0 && i < entry.size()) {
dataBuffer.putLong(entry.underlyingArray[i++]);
}
dataBuffer.putLong(keywords[i]);
dataBuffer.putLong(metadata[i]);
dataBuffer.put((byte) positions[i].size());
dataBuffer.put(positions[i].buffer());
}
numEntries++;
final int bytesWritten = 8 * ( /*header = 3 longs */ 3 + entry.size());
return bytesWritten;
return recordSize;
}
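// Resulting on-disk record layout, matching DOCUMENT_HEADER_SIZE_BYTES = 24 and
// TERM_HEADER_SIZE_BYTES = 17:
//
//   int  recordSize          byte size of the term data below
//   int  documentFeatures
//   long combinedId
//   long documentMeta
//   for each term:
//     long termId
//     long metadata
//     byte positionsSize
//     positionsSize bytes of gamma coded positions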
public void close() throws IOException {
@ -121,7 +136,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
// Finalize the file by writing a header in the beginning
ByteBuffer header = ByteBuffer.allocate(16);
ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
header.putLong(numEntries);
header.putLong(0); // reserved for future use
header.flip();

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.journal;
import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
@ -18,52 +17,52 @@ import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class IndexJournalTest {
Path tempFile;
IndexJournalReader reader;
long firstDocId = UrlIdCodec.encodeId(44, 10);
long secondDocId = UrlIdCodec.encodeId(43, 15);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
.add(1, 2)
.add(2, 3)
.add(3, 4)
.add(5, 6).build());
journalWriter.put(IndexJournalEntry.builder(43, 15, 10)
.add(5, 5)
.add(6, 6)
.build());
journalWriter.close();
reader = new IndexJournalReaderSingleFile(tempFile);
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(tempFile);
}
@Test
public void forEachDocId() {
List<Long> expected = List.of(firstDocId, secondDocId);
List<Long> actual = new ArrayList<>();
reader.forEachDocId(actual::add);
assertEquals(expected, actual);
}
@Test
public void forEachWordId() {
List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
List<Integer> actual = new ArrayList<>();
reader.forEachWordId(i -> actual.add((int) i));
assertEquals(expected, actual);
}
// Path tempFile;
// IndexJournalReader reader;
//
// long firstDocId = UrlIdCodec.encodeId(44, 10);
// long secondDocId = UrlIdCodec.encodeId(43, 15);
//
// @BeforeEach
// public void setUp() throws IOException {
// tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
//
// var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
// journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
// .add(1, 2)
// .add(2, 3)
// .add(3, 4)
// .add(5, 6).build());
//
// journalWriter.put(IndexJournalEntry.builder(43, 15, 10)
// .add(5, 5)
// .add(6, 6)
// .build());
// journalWriter.close();
//
// reader = new IndexJournalReaderSingleFile(tempFile);
// }
// @AfterEach
// public void tearDown() throws IOException {
// Files.delete(tempFile);
// }
//
// @Test
// public void forEachDocId() {
// List<Long> expected = List.of(firstDocId, secondDocId);
// List<Long> actual = new ArrayList<>();
//
// reader.forEachDocId(actual::add);
// assertEquals(expected, actual);
// }
//
// @Test
// public void forEachWordId() {
// List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
// List<Integer> actual = new ArrayList<>();
//
// reader.forEachWordId(i -> actual.add((int) i));
// assertEquals(expected, actual);
// }
}

View File

@ -0,0 +1,367 @@
package nu.marginalia.index.journal;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
public class IndexJournalWriterTest {
Path tempFile;
Path tempFile2;
ByteBuffer workArea = ByteBuffer.allocate(1024);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat");
}
@AfterEach
public void tearDown() throws IOException {
Files.delete(tempFile);
Files.delete(tempFile2);
}
private GammaCodedSequence gcs(int... values) {
return GammaCodedSequence.generate(workArea, values);
}
static MurmurHash3_128 hasher = new MurmurHash3_128();
static long wordId(String str) {
return hasher.hashKeyword(str);
}
@Test
public void testSingleFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testMultiFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
writer.put(new IndexJournalEntryHeader(11, 22, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
writer.put(new IndexJournalEntryHeader(12, 23, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderPagingImpl(List.of(tempFile, tempFile2));
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(55, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// Term 2
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word2"), termData.termId());
assertEquals(56, termData.metadata());
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testSingleFileIterTwice() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 1 */
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
// Ensure we can iterate again over the same document without persisting state or closing the pointer
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(44, termData.metadata());
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
@Test
public void testFiltered() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
new GammaCodedSequence[]{
gcs(1, 3, 5),
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
new GammaCodedSequence[]{
gcs(2, 4, 6),
gcs(3, 5, 7),
}
));
}
catch (IOException ex) {
Assertions.fail(ex);
}
// Read the journal back
try {
var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45);
Iterator<IndexJournalEntryTermData> iter;
IndexJournalEntryTermData termData;
try (var ptr = reader.newPointer()) {
/** DOCUMENT 2 */
assertTrue(ptr.nextDocument());
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
iter = ptr.iterator();
// Term 1
assertTrue(iter.hasNext());
termData = iter.next();
assertEquals(wordId("word1"), termData.termId());
assertEquals(45, termData.metadata());
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
// No more terms
assertFalse(iter.hasNext());
// No more documents
assertFalse(ptr.nextDocument());
}
}
catch (IOException ex) {
Assertions.fail(ex);
}
}
}

View File

@ -9,125 +9,125 @@ import java.util.ArrayList;
import static org.junit.jupiter.api.Assertions.assertEquals;
class IndexJournalPointerTest {
@Test
public void concatenate() {
MockPointer left = new MockPointer(
List.of(new MockDocument(1, 2, 3, List.of(
new MockRecord(4, 5),
new MockRecord(6, 7))
))
);
MockPointer right = new MockPointer(
List.of(new MockDocument(8, 9, 10, List.of(
new MockRecord(11, 12),
new MockRecord(13, 14))
))
);
IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right);
List<Long> docIdsSeq = new ArrayList<>();
List<Long> wordIdsSeq = new ArrayList<>();
while (concatenated.nextDocument()) {
docIdsSeq.add(concatenated.documentId());
while (concatenated.nextRecord()) {
wordIdsSeq.add(concatenated.wordId());
}
}
assertEquals(docIdsSeq, List.of(1L, 8L));
assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L));
}
@Test
public void filter() {
MockPointer left = new MockPointer(
List.of(new MockDocument(1, 2, 3, List.of(
new MockRecord(1, 1),
new MockRecord(2, 2),
new MockRecord(3, 3),
new MockRecord(4, 4),
new MockRecord(5, 5)
)
), new MockDocument(2, 2, 3, List.of(
new MockRecord(1, 1),
new MockRecord(3, 3),
new MockRecord(5, 5)
)
))
);
var filtered = left.filterWordMeta(meta -> (meta % 2) == 0);
List<Long> docIdsSeq = new ArrayList<>();
List<Long> wordIdsSeq = new ArrayList<>();
while (filtered.nextDocument()) {
docIdsSeq.add(filtered.documentId());
while (filtered.nextRecord()) {
wordIdsSeq.add(filtered.wordId());
}
}
assertEquals(docIdsSeq, List.of(1L, 2L));
assertEquals(wordIdsSeq, List.of(2L, 4L));
}
class MockPointer implements IndexJournalPointer {
private final List<MockDocument> documents;
int di = -1;
int ri;
public MockPointer(Collection<MockDocument> documents) {
this.documents = new ArrayList<>(documents);
}
@Override
public boolean nextDocument() {
if (++di < documents.size()) {
ri = -1;
return true;
}
return false;
}
@Override
public boolean nextRecord() {
if (++ri < documents.get(di).records.size()) {
return true;
}
return false;
}
@Override
public long documentId() {
return documents.get(di).docId;
}
@Override
public long documentMeta() {
return documents.get(di).docMeta;
}
@Override
public long wordId() {
return documents.get(di).records.get(ri).wordId;
}
@Override
public long wordMeta() {
return documents.get(di).records.get(ri).wordMeta;
}
@Override
public int documentFeatures() {
return documents.get(di).docFeatures;
}
}
record MockDocument(long docId, long docMeta, int docFeatures, List<MockRecord> records) {}
record MockRecord(long wordId, long wordMeta) {}
//
// @Test
// public void concatenate() {
// MockPointer left = new MockPointer(
// List.of(new MockDocument(1, 2, 3, List.of(
// new MockRecord(4, 5),
// new MockRecord(6, 7))
// ))
// );
//
// MockPointer right = new MockPointer(
// List.of(new MockDocument(8, 9, 10, List.of(
// new MockRecord(11, 12),
// new MockRecord(13, 14))
// ))
// );
//
// IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right);
// List<Long> docIdsSeq = new ArrayList<>();
// List<Long> wordIdsSeq = new ArrayList<>();
// while (concatenated.nextDocument()) {
// docIdsSeq.add(concatenated.documentId());
// while (concatenated.nextRecord()) {
// wordIdsSeq.add(concatenated.termId());
// }
// }
//
// assertEquals(docIdsSeq, List.of(1L, 8L));
// assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L));
// }
//
// @Test
// public void filter() {
// MockPointer left = new MockPointer(
// List.of(new MockDocument(1, 2, 3, List.of(
// new MockRecord(1, 1),
// new MockRecord(2, 2),
// new MockRecord(3, 3),
// new MockRecord(4, 4),
// new MockRecord(5, 5)
// )
// ), new MockDocument(2, 2, 3, List.of(
// new MockRecord(1, 1),
// new MockRecord(3, 3),
// new MockRecord(5, 5)
// )
// ))
//
// );
// var filtered = left.filterWordMeta(meta -> (meta % 2) == 0);
//
// List<Long> docIdsSeq = new ArrayList<>();
// List<Long> wordIdsSeq = new ArrayList<>();
// while (filtered.nextDocument()) {
// docIdsSeq.add(filtered.documentId());
// while (filtered.nextRecord()) {
// wordIdsSeq.add(filtered.termId());
// }
// }
//
// assertEquals(docIdsSeq, List.of(1L, 2L));
// assertEquals(wordIdsSeq, List.of(2L, 4L));
// }
//
// class MockPointer implements IndexJournalPointer {
// private final List<MockDocument> documents;
//
// int di = -1;
// int ri;
//
// public MockPointer(Collection<MockDocument> documents) {
// this.documents = new ArrayList<>(documents);
// }
//
// @Override
// public boolean nextDocument() {
// if (++di < documents.size()) {
// ri = -1;
// return true;
// }
//
// return false;
// }
//
// @Override
// public boolean nextRecord() {
// if (++ri < documents.get(di).records.size()) {
// return true;
// }
//
// return false;
// }
//
// @Override
// public long documentId() {
// return documents.get(di).docId;
// }
//
// @Override
// public long documentMeta() {
// return documents.get(di).docMeta;
// }
//
// @Override
// public long termId() {
// return documents.get(di).records.get(ri).termId;
// }
//
// @Override
// public long wordMeta() {
// return documents.get(di).records.get(ri).wordMeta;
// }
//
// @Override
// public int documentFeatures() {
// return documents.get(di).docFeatures;
// }
// }
//
// record MockDocument(long docId, long docMeta, int docFeatures, List<MockRecord> records) {}
// record MockRecord(long termId, long wordMeta) {}
}

View File

@ -16,12 +16,16 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:process')
implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')
implementation libs.bundles.slf4j

View File

@ -0,0 +1,51 @@
package nu.marginalia.index.construction;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PositionsFileConstructor implements AutoCloseable {
private final Path file;
private final FileChannel channel;
private long offset;
private final ByteBuffer workBuffer = ByteBuffer.allocate(8192);
public PositionsFileConstructor(Path file) throws IOException {
this.file = file;
channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
}
/** Add a term to the positions file
* @param termMeta the term metadata
* @param positions the positions of the term
* @return the offset of the term in the file
*/
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
synchronized (file) {
var positionBuffer = positions.buffer();
int size = 1 + positionBuffer.remaining();
if (workBuffer.remaining() < size) {
workBuffer.flip();
channel.write(workBuffer);
workBuffer.clear();
}
workBuffer.put(termMeta);
workBuffer.put(positionBuffer);
long startOffset = offset;
offset += size;
return startOffset; // offset at which this term's record begins, per the doc comment above
}
}
public void close() throws IOException {
if (workBuffer.position() > 0) {
workBuffer.flip();
channel.write(workBuffer); // flush remaining buffered data before closing
}
channel.force(false);
channel.close();
}
}
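A sketch of the intended use during index construction (cf. ReverseIndexConstructor and ReversePreindexDocuments below; positionsFile, termMeta and positions stand in for the caller's values):

try (var posConstructor = new PositionsFileConstructor(positionsFile)) {
    long posOffset = posConstructor.add(termMeta, positions);
    // the reverse index stores posOffset in place of the former word metadata
}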

View File

@ -7,6 +7,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
@ -48,18 +49,22 @@ public class ReverseIndexConstructor {
return;
}
Path positionsFile = tmpDir.resolve("positions.dat");
Files.deleteIfExists(positionsFile);
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName)) {
heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) {
try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
PositionsFileConstructor posConstructor = new PositionsFileConstructor(positionsFile);
) {
AtomicInteger progress = new AtomicInteger(0);
inputs
.parallelStream()
.map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
return construct(in);
return construct(in, posConstructor);
})
.reduce(this::merge)
.ifPresent((index) -> {
@ -73,9 +78,9 @@ public class ReverseIndexConstructor {
}
@SneakyThrows
private ReversePreindexReference construct(Path input) {
private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
return ReversePreindex
.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}

View File

@ -40,6 +40,7 @@ public class ReversePreindex {
* will have randomly assigned names.
*/
public static ReversePreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
{
@ -48,7 +49,7 @@ public class ReversePreindex {
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
return new ReversePreindex(segments, docs);
}

View File

@ -21,6 +21,7 @@ import java.util.concurrent.TimeUnit;
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
private static PositionsFileConstructor positionsFileConstructor;
final Path file;
public final LongArray documents;
private static final int RECORD_SIZE_LONGS = 2;
@ -36,7 +37,9 @@ public class ReversePreindexDocuments {
Path workDir,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
ReversePreindexWordSegments segments) throws IOException {
ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
@ -75,14 +78,14 @@ public class ReversePreindexDocuments {
var pointer = reader.newPointer();
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
while (pointer.nextRecord()) {
long wordId = pointer.wordId();
long wordMeta = pointer.wordMeta();
for (var termData : pointer) {
long termId = termData.termId();
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, wordMeta);
assembly.put(offset + 1, posOffset);
}
}

View File

@ -12,7 +12,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each wordId.
* and the count of documents associated with each termId.
*/
public class ReversePreindexWordSegments {
public final LongArray wordIds;
@ -34,7 +34,7 @@ public class ReversePreindexWordSegments {
this.countsFile = countsFile;
}
/** Returns a long-long hash map where each key is a wordId,
/** Returns a long-long hash map where each key is a termId,
* and each value is the start offset of the data.
*/
public Long2LongOpenHashMap asMap(int recordSize) {
@ -188,7 +188,7 @@ public class ReversePreindexWordSegments {
if (i == fileSize) {
// We've reached the end of the iteration and there is no
// "next" wordId to fetch
// "next" termId to fetch
wordId = Long.MIN_VALUE;
return false;
}

View File

@ -2,12 +2,14 @@ package nu.marginalia.index;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.nio.file.Files;
@ -89,7 +91,9 @@ class ReverseIndexReaderTest {
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
var preindex = ReversePreindex.constructPreindex(reader,
Mockito.mock(PositionsFileConstructor.class),
DocIdRewriter.identity(), tempDir);
Path docsFile = tempDir.resolve("docs.dat");

View File

@ -100,6 +100,7 @@ class ReversePreindexDocsTest {
assertEquals(expected, actual);
}
@Test
public void testDocs2() throws IOException {
var reader = journalFactory.createReader(
@ -108,7 +109,7 @@ class ReversePreindexDocsTest {
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),

View File

@ -5,6 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.nio.file.Files;
@ -60,12 +61,18 @@ public class TestJournalFactory {
var writer = new IndexJournalWriterSingleFileImpl(jf);
for (var entry : entries) {
long[] data = new long[entry.wordIds.length * 2];
for (int i = 0; i < entry.wordIds.length; i++)
data[i*2] = entry.wordIds[i];
long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i];
meta[i] = 0;
positions[i] = new GammaCodedSequence(new byte[1]);
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
new IndexJournalEntryData(data));
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
@ -77,14 +84,18 @@ public class TestJournalFactory {
var writer = new IndexJournalWriterSingleFileImpl(jf);
for (var entry : entries) {
long[] data = new long[entry.wordIds.length * 2];
long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
data[i * 2] = entry.wordIds[i].wordId;
data[i * 2 + 1] = entry.wordIds[i].meta;
termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta;
positions[i] = new GammaCodedSequence(new byte[1]);
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
new IndexJournalEntryData(data));
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);

View File

@ -8,15 +8,16 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@ -41,6 +42,7 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -300,7 +302,18 @@ public class IndexQueryServiceIntegrationSmokeTest {
"test", "test", 0., "HTML5", 0, null, 0, 10
));
indexJournalWriter.put(header, new IndexJournalEntryData(data));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
@SneakyThrows
@ -309,19 +322,24 @@ public class IndexQueryServiceIntegrationSmokeTest {
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());
long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {
data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
indexJournalWriter.put(header, new IndexJournalEntryData(data));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}

View File

@ -7,13 +7,14 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@ -44,6 +45,7 @@ import org.junit.jupiter.api.parallel.Execution;
import javax.annotation.CheckReturnValue;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -549,13 +551,13 @@ public class IndexQueryServiceIntegrationTest {
meta.documentMetadata.encode()
);
long[] dataArray = new long[words.size() * 2];
for (int i = 0; i < words.size(); i++) {
dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword);
dataArray[2*i+1] = words.get(i).termMetadata;
}
var entry = new IndexJournalEntryData(dataArray);
indexJournalWriter.put(header, entry);
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions?
Arrays.setAll(positions, i -> new GammaCodedSequence(ByteBuffer.allocate(1)));
indexJournalWriter.put(header,
new IndexJournalEntryData(keywords, metadata, positions));
});
var linkdbWriter = new DocumentDbWriter(
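If the FIXME above were resolved, positions could be filled in the same way the smoke test does it. A hypothetical sketch, not part of this commit (note the codec forbids zeroes, hence i + 1):

    ByteBuffer workArea = ByteBuffer.allocate(16); // scratch space reused across calls, as in the smoke test
    Arrays.setAll(positions, i -> GammaCodedSequence.generate(workArea, i + 1)); // one dummy position per keyword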

View File

@ -7,18 +7,30 @@ import nu.marginalia.sequence.io.BitWriter;
import java.nio.ByteBuffer;
/** Implement coding and decoding of sequences of integers using the Elias Gamma code
*
* https://en.wikipedia.org/wiki/Elias_gamma_coding
/** Implement coding and decoding of sequences of integers using the Elias Gamma code.
* The sequence is prefixed by its length, then the deltas between
* consecutive integers are encoded using the Elias Gamma code.
* <p></p>
* <a href="https://en.wikipedia.org/wiki/Elias_gamma_coding">https://en.wikipedia.org/wiki/Elias_gamma_coding</a>
* */
public class EliasGammaCodec implements IntIterator {
private final BitReader reader;
int rem = 0;
private int last = 0;
private int next = 0;
private EliasGammaCodec(ByteBuffer buffer) {
reader = new BitReader(buffer);
int bits = reader.takeWhileZero();
if (!reader.hasMore()) {
rem = 0;
}
else {
rem = reader.get(bits);
}
}
/** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */
@ -31,7 +43,13 @@ public class EliasGammaCodec implements IntIterator {
* or equal to zero.
*/
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
if (sequence.isEmpty())
return ByteBuffer.allocate(0);
var writer = new BitWriter(workArea);
writer.putGammaCoded(sequence.size());
int last = 0;
for (var iter = sequence.iterator(); iter.hasNext(); ) {
@ -42,9 +60,7 @@ public class EliasGammaCodec implements IntIterator {
// can't encode zeroes
assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values";
int bits = Integer.numberOfTrailingZeros(Integer.highestOneBit(delta));
writer.put(0, bits + 1);
writer.put(delta, bits + 1);
writer.putGammaCoded(delta);
}
return writer.finish();
@ -60,16 +76,13 @@ public class EliasGammaCodec implements IntIterator {
@Override
public boolean hasNext() {
if (next > 0)
return true;
if (!reader.hasMore())
return false;
if (next > 0) return true;
if (!reader.hasMore() || --rem < 0) return false;
int bits = reader.takeWhileZero();
if (!reader.hasMore()) {
return false;
}
if (!reader.hasMore()) return false;
int delta = reader.get(bits);
last += delta;
next = last;
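For reference, a round-trip sketch of the scheme described in the new class comment, hand-checked against the encode/decode entry points shown above (the IntList import is fastutil's):

    import it.unimi.dsi.fastutil.ints.IntList;
    import java.nio.ByteBuffer;

    class GammaRoundTrip {
        public static void main(String[] args) {
            ByteBuffer workArea = ByteBuffer.allocate(1024);
            // On the wire: gamma(count = 3), then the gamma-coded deltas 1, 2, 7
            ByteBuffer encoded = EliasGammaCodec.encode(workArea, IntList.of(1, 3, 10));
            var iter = EliasGammaCodec.decode(encoded);
            while (iter.hasNext()) {
                System.out.print(iter.nextInt() + " "); // prints: 1 3 10
            }
        }
    }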

View File

@ -16,6 +16,8 @@ import java.util.StringJoiner;
* */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
private final ByteBuffer raw;
int startPos = 0;
int startLimit = 0;
/** Create a new GammaCodedSequence from a sequence of integers.
*
@ -37,12 +39,16 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
public GammaCodedSequence(ByteBuffer bytes) {
this.raw = bytes;
startPos = bytes.position();
startLimit = bytes.limit();
}
public GammaCodedSequence(byte[] bytes) {
raw = ByteBuffer.allocate(bytes.length);
raw.put(bytes);
raw.clear();
startPos = 0;
startLimit = bytes.length;
}
/** Return the raw bytes of the sequence. */
@ -52,21 +58,29 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return raw.array();
}
else {
raw.clear();
byte[] bytes = new byte[raw.capacity()];
raw.get(bytes, 0, bytes.length);
raw.get(0, bytes, 0, bytes.length);
return bytes;
}
}
@Override
public IntIterator iterator() {
raw.clear();
raw.position(startPos);
raw.limit(startLimit);
return EliasGammaCodec.decode(raw);
}
public IntList values() {
var intItr = iterator();
IntArrayList ret = new IntArrayList(8);
while (intItr.hasNext()) {
ret.add(intItr.nextInt());
}
return ret;
}
/** Decode the sequence into an IntList;
* this is a somewhat slow operation,
* iterating over the data directly is more performant */
@ -94,4 +108,15 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
}
return sj.toString();
}
public ByteBuffer buffer() {
raw.position(startPos);
raw.limit(startLimit);
return raw;
}
public int size() {
return raw.capacity();
}
}
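A short usage sketch, mirroring how the test code elsewhere in this commit calls the class (scratch buffer in, decoded values out):

    ByteBuffer workArea = ByteBuffer.allocate(1024);                          // scratch space for the encoder
    GammaCodedSequence seq = GammaCodedSequence.generate(workArea, 1, 3, 10);
    IntList positions = seq.values();                                         // decodes back to [1, 3, 10]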

View File

@ -78,7 +78,7 @@ public class BitReader {
int result = 0;
for (;;) {
do {
// Ensure we have bits to read
if (bitPosition <= 0) {
if (underlying.hasRemaining())
@ -96,10 +96,8 @@ public class BitReader {
// Subtract the number of bits read from the current position
bitPosition -= zeroes;
// If bitPosition isn't zero, we've found a 1 and can stop
if (bitPosition > 0)
break;
}
// Keep looping while the bit position is not positive; a positive position means we've found a 1 and can stop
} while (bitPosition <= 0);
return result;
}

View File

@ -72,6 +72,17 @@ public class BitWriter {
}
}
/** Write the provided value in a gamma-coded format,
* i.e. by first finding the number of significant bits,
* then writing that many zeroes, then the bits themselves
*/
public void putGammaCoded(int value) {
int bits = 1 + Integer.numberOfTrailingZeros(Integer.highestOneBit(value));
put(0, bits);
put(value, bits);
}
public ByteBuffer finish() {
finishLastByte();
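Worked through by hand for value = 5, as a sketch; the reader side pairs takeWhileZero with get, just as the codec above does:

    // bits = 1 + numberOfTrailingZeros(highestOneBit(5)) = 3
    // put(0, 3) emits 000, put(5, 3) emits 101  ->  bit stream 000101
    var writer = new BitWriter(ByteBuffer.allocate(16));
    writer.putGammaCoded(5);
    var reader = new BitReader(writer.finish());
    int bits = reader.takeWhileZero();  // consumes 000, returns 3
    int value = reader.get(bits);       // reads 101, returns 5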

View File

@ -115,16 +115,17 @@ class BitReaderTest {
}
@Test
public void testTakeWhileZeroOverInt32() {
public void testTakeWhileZeroOverInt64() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
writer.put(0, 32);
writer.put(0, 32);
writer.put(0, 2);
writer.putBit(true);
var buffer = writer.finish();
var reader = new BitReader(buffer);
int val = reader.takeWhileZero();
assertEquals(34, val);
assertEquals(66, val);
assertTrue(reader.getBit());
}
}

View File

@ -4,9 +4,9 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@ -18,9 +18,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.sql.SQLException;
import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
@Singleton
public class LoaderIndexJournalWriter {
@ -28,12 +26,11 @@ public class LoaderIndexJournalWriter {
private final IndexJournalWriter indexWriter;
private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
private final MurmurHash3_128 hasher = new MurmurHash3_128();
private final long[] buffer = new long[MAX_LENGTH * 2];
private final long[] buffer = new long[65536];
@Inject
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException {
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);
var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea);
@ -68,26 +65,10 @@ public class LoaderIndexJournalWriter {
return;
}
var pointer = wordSet.newPointer();
while (pointer.hasMore()) {
int i = 0;
while (i < buffer.length
&& pointer.advancePointer())
{
final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword());
buffer[i++] = hashedKeyword;
buffer[i++] = pointer.getMetadata();
}
var entry = new IndexJournalEntryData(i, buffer);
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
indexWriter.put(header, entry);
}
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);
indexWriter.put(header, data);
}
public void close() throws Exception {

View File

@ -1,87 +0,0 @@
package nu.marginalia.loading.loader;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.LongStream;
import static org.junit.jupiter.api.Assertions.*;
class LoaderIndexJournalWriterTest {
Path tempDir;
LoaderIndexJournalWriter writer;
@BeforeEach
public void setUp() throws IOException, SQLException {
tempDir = Files.createTempDirectory(getClass().getSimpleName());
FileStorageService storageService = Mockito.mock(FileStorageService.class);
Mockito.when(storageService.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 1,null, tempDir.toString()));
writer = new LoaderIndexJournalWriter(storageService);
}
@AfterEach
public void tearDown() throws Exception {
writer.close();
List<Path> junk = Files.list(tempDir.resolve("iw")).toList();
for (var item : junk)
Files.delete(item);
Files.delete(tempDir.resolve("iw"));
Files.delete(tempDir);
}
@Test
public void testBreakup() throws Exception {
String[] keywords = new String[2000];
long[] metadata = new long[2000];
GammaCodedSequence[] positions = new GammaCodedSequence[2000];
ByteBuffer workArea = ByteBuffer.allocate(1024);
for (int i = 0; i < 2000; i++) {
keywords[i] = Integer.toString(i);
metadata[i] = i+1;
positions[i] = GammaCodedSequence.generate(workArea, 1, 2, 3);
}
DocumentKeywords words = new DocumentKeywords(keywords, metadata, positions);
writer.putWords(1, 0, new DocumentMetadata(0),
words);
writer.close();
List<Path> journalFiles = IndexJournalFileNames.findJournalFiles(tempDir.resolve("iw"));
assertEquals(1, journalFiles.size());
var reader = new IndexJournalReaderSingleFile(journalFiles.get(0));
List<Long> docIds = new ArrayList<>();
reader.forEachDocId(docIds::add);
assertEquals(List.of(1L, 1L), docIds);
List<Long> metas = new ArrayList<Long>();
var ptr = reader.newPointer();
while (ptr.nextDocument()) {
while (ptr.nextRecord()) {
metas.add(ptr.wordMeta());
}
}
assertEquals(LongStream.of(metadata).boxed().toList(), metas);
}
}

View File

@ -33,6 +33,7 @@ public class SearchMain extends MainClass {
new ServiceDiscoveryModule(),
new DatabaseModule(false)
);
// Orchestrate the boot order for the services
var registry = injector.getInstance(ServiceRegistryIf.class);