Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(index, WIP) Position data partially integrated with forward and reverse indexes.
There's no graceful way of doing this in small commits, so it's being pushed as one large commit to avoid the risk of data loss.
This commit is contained in:
parent
9b922af075
commit
4a8afa6b9f
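For orientation before the diff: the journal's new per-term record layout, as implied by TERM_HEADER_SIZE_BYTES = 17 and the TermDataIterator further down, is an 8-byte term id, an 8-byte metadata word, a 1-byte length, and then that many bytes of gamma-coded positions. The decoder sketch below is illustrative only and not part of the commit; the class and record names are made up.

import java.nio.ByteBuffer;

// Illustrative decoder for one term record, mirroring TermDataIterator.next():
// [ termId: long ][ metadata: long ][ size: unsigned byte ][ size bytes of gamma-coded positions ]
class TermRecordSketch {
    record TermRecord(long termId, long metadata, byte[] gammaCodedPositions) {}

    static TermRecord readOne(ByteBuffer buffer) {
        long termId = buffer.getLong();      // 8 bytes
        long metadata = buffer.getLong();    // 8 bytes
        int size = buffer.get() & 0xFF;      // 1 byte; 8 + 8 + 1 = TERM_HEADER_SIZE_BYTES
        byte[] positions = new byte[size];   // gamma-coded positions payload
        buffer.get(positions);
        return new TermRecord(termId, metadata, positions);
    }
}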
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import java.io.Serial;
|
||||
@ -26,26 +25,6 @@ public final class DocumentKeywords implements Serializable {
|
||||
assert keywords.length == metadata.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(getClass().getSimpleName());
|
||||
sb.append('[');
|
||||
var pointer = newPointer();
|
||||
while (pointer.advancePointer()) {
|
||||
sb.append("\n\t ");
|
||||
|
||||
long metadata = pointer.getMetadata();
|
||||
String keyword = pointer.getKeyword();
|
||||
sb.append(keyword);
|
||||
|
||||
if (metadata != 0) {
|
||||
sb.append("/").append(new WordMetadata(metadata));
|
||||
}
|
||||
}
|
||||
return sb.append("\n]").toString();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return keywords.length == 0;
|
||||
}
|
||||
@ -54,11 +33,6 @@ public final class DocumentKeywords implements Serializable {
|
||||
return keywords.length;
|
||||
}
|
||||
|
||||
/** Return a pointer for traversing this structure */
|
||||
public DocumentKeywordsPointer newPointer() {
|
||||
return new DocumentKeywordsPointer(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -41,7 +41,7 @@ public class DocumentKeywordsBuilder {
|
||||
|
||||
meta[i] = entry.getLongValue();
|
||||
wordArray[i] = entry.getKey();
|
||||
positions[i] = GammaCodedSequence.generate(workArea, wordToPos.get(entry.getKey()));
|
||||
positions[i] = GammaCodedSequence.generate(workArea, wordToPos.getOrDefault(entry.getKey(), IntList.of()));
|
||||
}
|
||||
|
||||
return new DocumentKeywords(wordArray, meta, positions);
|
||||
|
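The builder above now guards against missing position lists with wordToPos.getOrDefault(entry.getKey(), IntList.of()) before handing them to GammaCodedSequence.generate(workArea, ...). The diff does not spell out the coding itself; purely for intuition, here is a hypothetical sketch of classic Elias gamma coding applied to a gap-coded position list (the real GammaCodedSequence format may differ in its details):

// Hypothetical illustration of Elias gamma coding, emitted as a bit string for readability.
class EliasGammaSketch {
    // gamma(x) for x >= 1: (bit length - 1) zero bits, then the binary form of x.
    static String gamma(int x) {
        String bin = Integer.toBinaryString(x);
        return "0".repeat(bin.length() - 1) + bin;
    }

    public static void main(String[] args) {
        int[] positions = {1, 3, 5};        // sorted positions of a word in a document
        StringBuilder bits = new StringBuilder();
        int prev = 0;
        for (int p : positions) {
            bits.append(gamma(p - prev));   // encode the gap to the previous position
            prev = p;
        }
        System.out.println(bits);           // small gaps compress to few bits
    }
}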
@ -1,48 +0,0 @@
|
||||
package nu.marginalia.keyword.model;
|
||||
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
/** Pointer into a {@see DocumentKeywords}. It starts out before the first position;
* advance it forward with advancePointer().
|
||||
* */
|
||||
public class DocumentKeywordsPointer {
|
||||
private int pos = -1;
|
||||
|
||||
private final DocumentKeywords keywords;
|
||||
|
||||
DocumentKeywordsPointer(DocumentKeywords keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
/** Number of positions remaining */
|
||||
public int remaining() {
|
||||
return keywords.size() - Math.max(0, pos);
|
||||
}
|
||||
|
||||
/** Return the keyword associated with the current position */
|
||||
public String getKeyword() {
|
||||
return keywords.keywords[pos];
|
||||
}
|
||||
|
||||
/** Return the metadata associated with the current position */
|
||||
public long getMetadata() {
|
||||
return keywords.metadata[pos];
|
||||
}
|
||||
|
||||
/** Return the positions associated with the current position */
|
||||
public GammaCodedSequence getPositions() {
|
||||
return keywords.positions[pos];
|
||||
}
|
||||
|
||||
/** Advance the current position,
|
||||
* returns false if this was the
|
||||
* last position */
|
||||
public boolean advancePointer() {
|
||||
return ++pos < keywords.size();
|
||||
}
|
||||
|
||||
/** Returns true unless the pointer is beyond the last position in the keyword set */
|
||||
public boolean hasMore() {
|
||||
return pos + 1 < keywords.size();
|
||||
}
|
||||
}
|
@ -92,26 +92,17 @@ class DocumentKeywordExtractorTest {
|
||||
);
|
||||
|
||||
var keywordsBuilt = keywords.build();
|
||||
var ptr = keywordsBuilt.newPointer();
|
||||
|
||||
Map<String, WordMetadata> flags = new HashMap<>();
|
||||
Map<String, GammaCodedSequence> positions = new HashMap<>();
|
||||
|
||||
ByteBuffer work = ByteBuffer.allocate(1024);
|
||||
for (int i = 0; i < keywordsBuilt.size(); i++) {
|
||||
String keyword = keywordsBuilt.keywords[i];
|
||||
long metadata = keywordsBuilt.metadata[i];
|
||||
|
||||
while (ptr.advancePointer()) {
|
||||
System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions());
|
||||
|
||||
int[] vals = ptr.getPositions().decode().toIntArray();
|
||||
for (int i = 0; i < vals.length; i++) {
|
||||
vals[i] = vals[i] + 1;
|
||||
}
|
||||
var out = EliasGammaCodec.encode(work, vals);
|
||||
System.out.println(out.capacity() + "/" + vals.length * 4);
|
||||
|
||||
if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
|
||||
flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()));
|
||||
positions.put(ptr.getKeyword(), ptr.getPositions());
|
||||
if (Set.of("dirty", "blues").contains(keyword)) {
|
||||
flags.put(keyword, new WordMetadata(metadata));
|
||||
positions.put(keyword, keywordsBuilt.positions[i]);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -15,12 +15,14 @@ dependencies {
|
||||
implementation 'org.jgrapht:jgrapht-core:1.5.2'
|
||||
|
||||
implementation project(':third-party:commons-codec')
|
||||
implementation project(':third-party:parquet-floor')
|
||||
|
||||
implementation project(':code:index:api')
|
||||
implementation project(':code:functions:link-graph:api')
|
||||
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
|
||||
implementation project(':code:common:db')
|
||||
implementation project(':code:common:config')
|
||||
|
@ -15,6 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
dependencies {
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:btree')
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:index:query')
|
||||
implementation project(':code:index:index-journal')
|
||||
implementation project(':code:common:model')
|
||||
|
@ -2,12 +2,14 @@ package nu.marginalia.index.forward;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntry;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import nu.marginalia.test.TestUtil;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -69,40 +71,40 @@ class ForwardIndexConverterTest {
|
||||
TestUtil.clearTempDir(dataDir);
|
||||
}
|
||||
|
||||
public int[] getFactorsI(int id) {
|
||||
return IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
||||
}
|
||||
|
||||
long createId(long url, long domain) {
|
||||
return UrlIdCodec.encodeId((int) domain, (int) url);
|
||||
}
|
||||
|
||||
public void createEntry(IndexJournalWriter writer, int id) {
|
||||
int[] factors = getFactorsI(id);
|
||||
|
||||
var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5);
|
||||
|
||||
for (int i = 0; i+1 < factors.length; i+=2) {
|
||||
entryBuilder.add(factors[i], -factors[i+1]);
|
||||
}
|
||||
|
||||
writer.put(entryBuilder.build());
|
||||
writer.put(
|
||||
new IndexJournalEntryHeader(createId(id, id/20),
|
||||
id%3,
|
||||
(id % 5)),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{},
|
||||
new long[]{},
|
||||
new GammaCodedSequence[]{}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testForwardIndex() throws IOException {
|
||||
|
||||
new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert();
|
||||
new ForwardIndexConverter(new FakeProcessHeartbeat(),
|
||||
new IndexJournalReaderSingleFile(indexFile),
|
||||
docsFileId,
|
||||
docsFileData,
|
||||
new DomainRankings()).convert();
|
||||
|
||||
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
|
||||
|
||||
for (int i = 36; i < workSetSize; i++) {
|
||||
long docId = createId(i, i/20);
|
||||
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
|
||||
assertEquals((i % 3), forwardReader.getHtmlFeatures(docId));
|
||||
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -13,8 +13,11 @@ java {
|
||||
apply from: "$rootProject.projectDir/srcsets.gradle"
|
||||
|
||||
dependencies {
|
||||
implementation project(':code:libraries:coded-sequence')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:common:model')
|
||||
implementation project(':third-party:parquet-floor')
|
||||
implementation project(':third-party:commons-codec')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
|
||||
@ -23,6 +26,7 @@ dependencies {
|
||||
implementation libs.guava
|
||||
implementation libs.trove
|
||||
implementation libs.zstd
|
||||
implementation libs.fastutil
|
||||
implementation libs.commons.lang3
|
||||
implementation libs.roaringbitmap
|
||||
|
||||
|
@ -1,27 +0,0 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
|
||||
/** An entry in the index journal.
|
||||
*
|
||||
* @param header the header of the entry, containing document level data
|
||||
* @param data the data of the entry, containing keyword level data
|
||||
*
|
||||
* @see IndexJournalEntryHeader
|
||||
* @see IndexJournalEntryData
|
||||
*/
|
||||
public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) {
|
||||
|
||||
public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) {
|
||||
return new IndexJournalEntryBuilder(0, documentId, documentMeta);
|
||||
}
|
||||
|
||||
public static IndexJournalEntryBuilder builder(int domainId,
|
||||
int urlId,
|
||||
long documentMeta) {
|
||||
|
||||
|
||||
return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta);
|
||||
}
|
||||
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
|
||||
public class IndexJournalEntryBuilder {
|
||||
private final long documentId;
|
||||
private final int documentFeatures;
|
||||
private final long documentMeta;
|
||||
private final TLongArrayList items = new TLongArrayList();
|
||||
|
||||
public IndexJournalEntryBuilder(
|
||||
int documentFeatures,
|
||||
long documentId,
|
||||
long documentMeta) {
|
||||
this.documentFeatures = documentFeatures;
|
||||
this.documentId = documentId;
|
||||
this.documentMeta = documentMeta;
|
||||
}
|
||||
|
||||
public IndexJournalEntryBuilder add(long wordId, long metadata) {
|
||||
|
||||
items.add(wordId);
|
||||
items.add(metadata);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public IndexJournalEntry build() {
|
||||
return new IndexJournalEntry(
|
||||
new IndexJournalEntryHeader(items.size(),
|
||||
documentFeatures,
|
||||
documentId,
|
||||
documentMeta),
|
||||
new IndexJournalEntryData(items.toArray())
|
||||
);
|
||||
}
|
||||
}
|
@ -1,77 +1,36 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
public record IndexJournalEntryData(long[] termIds,
|
||||
long[] metadata,
|
||||
GammaCodedSequence[] positions) {
|
||||
|
||||
/** The keyword data of an index journal entry.
|
||||
* The data itself is an interleaved array of
|
||||
* word ids and metadata.
|
||||
* <p>
|
||||
* Odd entries are term ids, even entries are encoded WordMetadata records.
|
||||
* </p>
|
||||
* <p>The civilized way of reading the journal data is to use an IndexJournalReader</p>
|
||||
*
|
||||
* @see WordMetadata
|
||||
* @see IndexJournalReader
|
||||
*/
|
||||
public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Record> {
|
||||
private final int size;
|
||||
public final long[] underlyingArray;
|
||||
|
||||
public static final int MAX_LENGTH = 1000;
|
||||
public static final int ENTRY_SIZE = 2;
|
||||
|
||||
public IndexJournalEntryData(long[] underlyingArray) {
|
||||
this.size = underlyingArray.length;
|
||||
this.underlyingArray = underlyingArray;
|
||||
public IndexJournalEntryData {
|
||||
assert termIds.length == metadata.length;
|
||||
assert termIds.length == positions.length;
|
||||
}
|
||||
|
||||
public IndexJournalEntryData(int size, long[] underlyingArray) {
|
||||
this.size = size;
|
||||
this.underlyingArray = underlyingArray;
|
||||
public IndexJournalEntryData(String[] keywords,
|
||||
long[] metadata,
|
||||
GammaCodedSequence[] positions)
|
||||
{
|
||||
this(termIds(keywords), metadata, positions);
|
||||
}
|
||||
|
||||
public long get(int idx) {
|
||||
if (idx >= size)
|
||||
throw new ArrayIndexOutOfBoundsException(idx + " vs " + size);
|
||||
return underlyingArray[idx];
|
||||
}
|
||||
private static final MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
public long[] toArray() {
|
||||
if (size == underlyingArray.length)
|
||||
return underlyingArray;
|
||||
else
|
||||
return Arrays.copyOf(underlyingArray, size);
|
||||
return termIds.length;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
|
||||
}
|
||||
|
||||
public Iterator<Record> iterator() {
|
||||
return new EntryIterator();
|
||||
}
|
||||
|
||||
private class EntryIterator implements Iterator<Record> {
|
||||
int pos = -ENTRY_SIZE;
|
||||
|
||||
public boolean hasNext() {
|
||||
return pos + 2*ENTRY_SIZE - 1 < size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Record next() {
|
||||
pos+=ENTRY_SIZE;
|
||||
|
||||
return new Record(underlyingArray[pos], underlyingArray[pos+1]);
|
||||
private static long[] termIds(String[] keywords) {
|
||||
long[] termIds = new long[keywords.length];
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
termIds[i] = hash.hashKeyword(keywords[i]);
|
||||
}
|
||||
return termIds;
|
||||
}
|
||||
|
||||
public record Record(long wordId, long metadata) {}
|
||||
}
|
||||
|
@ -0,0 +1,20 @@
|
||||
package nu.marginalia.index.journal.model;
|
||||
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
/** Data corresponding to a term in a document in the index journal.
|
||||
*
|
||||
* @param termId the id of the term
|
||||
* @param metadata the metadata of the term
|
||||
* @param positions the positions of the word in the document, gamma coded
|
||||
*
|
||||
* @see GammaCodedSequence
|
||||
*/
|
||||
public record IndexJournalEntryTermData(
|
||||
long termId,
|
||||
long metadata,
|
||||
GammaCodedSequence positions)
|
||||
{
|
||||
|
||||
|
||||
}
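A hypothetical consumer of such a record; positions().values() is the accessor the tests later in this diff use to get the decoded positions back as an IntList:

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;

// Illustrative only: dump one term record from the journal.
class TermDataPrinter {
    static void print(IndexJournalEntryTermData termData) {
        IntList positions = termData.positions().values();  // decode the gamma-coded positions
        System.out.println("termId=" + termData.termId()
                + " meta=" + termData.metadata()
                + " positions=" + positions);
    }
}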
|
@ -1,35 +1,29 @@
|
||||
package nu.marginalia.index.journal.reader;
|
||||
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class IndexJournalReadEntry {
|
||||
public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData> {
|
||||
public final IndexJournalEntryHeader header;
|
||||
|
||||
private final long[] buffer;
|
||||
private final ByteBuffer buffer;
|
||||
private final int initialPos;
|
||||
|
||||
public IndexJournalReadEntry(IndexJournalEntryHeader header, long[] buffer) {
|
||||
public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) {
|
||||
this.header = header;
|
||||
this.buffer = buffer;
|
||||
this.initialPos = buffer.position();
|
||||
}
|
||||
|
||||
|
||||
record WorkArea(byte[] bytes, LongBuffer buffer) {
|
||||
WorkArea(byte[] bytes) {
|
||||
this(bytes, ByteBuffer.wrap(bytes).asLongBuffer());
|
||||
}
|
||||
WorkArea() {
|
||||
this(new byte[8*65536]);
|
||||
}
|
||||
}
|
||||
|
||||
static ThreadLocal<WorkArea> pool = ThreadLocal.withInitial(WorkArea::new);
|
||||
static ThreadLocal<ByteBuffer> pool = ThreadLocal.withInitial(() -> ByteBuffer.allocate(8*65536));
|
||||
|
||||
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
|
||||
|
||||
@ -44,13 +38,11 @@ public class IndexJournalReadEntry {
|
||||
meta);
|
||||
|
||||
var workArea = pool.get();
|
||||
inputStream.readFully(workArea.bytes, 0, 8 * header.entrySize());
|
||||
|
||||
long[] out = new long[header.entrySize()];
|
||||
workArea.buffer.get(0, out, 0, out.length);
|
||||
|
||||
return new IndexJournalReadEntry(header, out);
|
||||
inputStream.readFully(workArea.array(), 0, header.entrySize());
|
||||
workArea.position(0);
|
||||
workArea.limit(header.entrySize());
|
||||
|
||||
return new IndexJournalReadEntry(header, workArea);
|
||||
}
|
||||
|
||||
public long docId() {
|
||||
@ -61,12 +53,54 @@ public class IndexJournalReadEntry {
|
||||
return header.documentMeta();
|
||||
}
|
||||
|
||||
public int documentFeatures() {
|
||||
return header.documentFeatures();
|
||||
}
|
||||
|
||||
public int domainId() {
|
||||
return UrlIdCodec.getDomainId(docId());
|
||||
}
|
||||
|
||||
public IndexJournalEntryData readEntry() {
|
||||
return new IndexJournalEntryData(header.entrySize(), buffer);
|
||||
public void reset() {
|
||||
buffer.position(initialPos);
|
||||
}
|
||||
|
||||
public Iterator<IndexJournalEntryTermData> iterator() {
|
||||
return new TermDataIterator(buffer, initialPos);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
|
||||
private final ByteBuffer buffer;
|
||||
|
||||
TermDataIterator(ByteBuffer buffer, int initialPos) {
|
||||
this.buffer = buffer;
|
||||
this.buffer.position(initialPos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return buffer.position() < buffer.limit();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexJournalEntryTermData next() {
|
||||
// read the metadata for the term
|
||||
long termId = buffer.getLong();
|
||||
long meta = buffer.getLong();
|
||||
|
||||
// read the size of the sequence data
|
||||
int size = buffer.get() & 0xFF;
|
||||
|
||||
// slice the buffer to get the sequence data
|
||||
var slice = buffer.slice(buffer.position(), size);
|
||||
var sequence = new GammaCodedSequence(slice);
|
||||
|
||||
// advance the buffer position to the next term
|
||||
buffer.position(buffer.position() + size);
|
||||
|
||||
return new IndexJournalEntryTermData(termId, meta, sequence);
|
||||
}
|
||||
|
||||
}
|
@ -12,6 +12,9 @@ public interface IndexJournalReader {
|
||||
int FILE_HEADER_SIZE_LONGS = 2;
|
||||
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
int DOCUMENT_HEADER_SIZE_BYTES = 24;
|
||||
int TERM_HEADER_SIZE_BYTES = 17;
|
||||
|
||||
/** Create a reader for a single file. */
|
||||
static IndexJournalReader singleFile(Path fileName) throws IOException {
|
||||
return new IndexJournalReaderSingleFile(fileName);
|
||||
@ -25,22 +28,23 @@ public interface IndexJournalReader {
|
||||
default void forEachWordId(LongConsumer consumer) {
|
||||
var ptr = this.newPointer();
|
||||
while (ptr.nextDocument()) {
|
||||
while (ptr.nextRecord()) {
|
||||
consumer.accept(ptr.wordId());
|
||||
for (var termData : ptr) {
|
||||
consumer.accept(termData.termId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
default void forEachDocId(LongConsumer consumer) {
|
||||
var ptr = this.newPointer();
|
||||
while (ptr.nextDocument()) {
|
||||
consumer.accept(ptr.documentId());
|
||||
default void forEachDocId(LongConsumer consumer) throws IOException {
|
||||
try (var ptr = this.newPointer()) {
|
||||
while (ptr.nextDocument()) {
|
||||
consumer.accept(ptr.documentId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Create a new pointer to the journal. The IndexJournalPointer is
|
||||
* a two-tiered iterator that allows both iteration over document records
|
||||
* and their keywords
|
||||
* and the terms within each document.
|
||||
*/
|
||||
IndexJournalPointer newPointer();
|
||||
|
||||
|
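A hypothetical caller of the reworked pointer API, mirroring the forEach* defaults above: nextDocument() advances the outer document tier, and the pointer itself is now Iterable over the current document's term data (the dump helper is made up):

import nu.marginalia.index.journal.reader.IndexJournalReader;

import java.io.IOException;

// Illustrative traversal of the two-tiered journal pointer.
class JournalDumpSketch {
    static void dump(IndexJournalReader reader) throws IOException {
        try (var ptr = reader.newPointer()) {
            while (ptr.nextDocument()) {                 // outer tier: documents
                for (var termData : ptr) {               // inner tier: terms of this document
                    System.out.println(ptr.documentId() + " " + termData.termId());
                }
            }
        }
    }
}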
@ -16,12 +16,15 @@ public class IndexJournalReaderPagingImpl implements IndexJournalReader {
|
||||
private final List<IndexJournalReader> readers;
|
||||
|
||||
public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
|
||||
var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir);
|
||||
if (inputFiles.isEmpty())
|
||||
this(IndexJournalFileNames.findJournalFiles(baseDir));
|
||||
|
||||
if (readers.isEmpty())
|
||||
logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir);
|
||||
else
|
||||
logger.info("Creating paging index journal reader for {} inputs", inputFiles.size());
|
||||
logger.info("Creating paging index journal reader for {} inputs", readers.size());
|
||||
}
|
||||
|
||||
public IndexJournalReaderPagingImpl(List<Path> inputFiles) throws IOException {
|
||||
this.readers = new ArrayList<>(inputFiles.size());
|
||||
|
||||
for (var inputFile : inputFiles) {
|
||||
|
@ -2,18 +2,20 @@ package nu.marginalia.index.journal.reader;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalFileHeader;
|
||||
import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class IndexJournalReaderSingleFile implements IndexJournalReader {
|
||||
|
||||
private Path journalFile;
|
||||
private final Path journalFile;
|
||||
public final IndexJournalFileHeader fileHeader;
|
||||
|
||||
@Override
|
||||
@ -58,8 +60,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
|
||||
private final IndexJournalFileHeader fileHeader;
|
||||
private final DataInputStream dataInputStream;
|
||||
private IndexJournalReadEntry entry;
|
||||
private IndexJournalEntryData entryData;
|
||||
private int recordIdx = -2;
|
||||
private int docIdx = -1;
|
||||
|
||||
public SingleFileJournalPointer(
|
||||
@ -73,9 +73,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public boolean nextDocument() {
|
||||
recordIdx = -2;
|
||||
entryData = null;
|
||||
|
||||
if (++docIdx < fileHeader.fileSizeRecords()) {
|
||||
entry = IndexJournalReadEntry.read(dataInputStream);
|
||||
return true;
|
||||
@ -86,19 +83,6 @@ class SingleFileJournalPointer implements IndexJournalPointer {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean nextRecord() {
|
||||
if (entryData == null) {
|
||||
entryData = entry.readEntry();
|
||||
}
|
||||
|
||||
recordIdx += 2;
|
||||
if (recordIdx < entryData.size()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long documentId() {
|
||||
return entry.docId();
|
||||
@ -109,22 +93,21 @@ class SingleFileJournalPointer implements IndexJournalPointer {
|
||||
return entry.docMeta();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long wordId() {
|
||||
return entryData.get(recordIdx);
|
||||
public int documentFeatures() { return entry.documentFeatures(); }
|
||||
|
||||
/** Return an iterator over the terms in the current document.
|
||||
* This iterator is not valid after calling nextDocument().
|
||||
*/
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<IndexJournalEntryTermData> iterator() {
|
||||
return entry.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordMeta() {
|
||||
return entryData.get(recordIdx + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int documentFeatures() {
|
||||
if (entryData == null) {
|
||||
entryData = entry.readEntry();
|
||||
}
|
||||
|
||||
return entry.header.documentFeatures();
|
||||
public void close() throws IOException {
|
||||
dataInputStream.close();
|
||||
}
|
||||
}
|
@ -1,5 +1,10 @@
|
||||
package nu.marginalia.index.journal.reader.pointer;
|
||||
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.function.LongPredicate;
|
||||
|
||||
/**
|
||||
@ -13,7 +18,7 @@ import java.util.function.LongPredicate;
|
||||
* nextDocument() will move the pointer from doc1 to doc2;<br>
|
||||
* nextRecord() will move the pointer from word1 to word2...<br>
|
||||
*/
|
||||
public interface IndexJournalPointer {
|
||||
public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>, AutoCloseable {
|
||||
/**
|
||||
* Advance to the next document in the journal,
|
||||
* returning true if such a document exists.
|
||||
@ -22,11 +27,6 @@ public interface IndexJournalPointer {
|
||||
*/
|
||||
boolean nextDocument();
|
||||
|
||||
/**
|
||||
* Advance to the next record in the journal
|
||||
*/
|
||||
boolean nextRecord();
|
||||
|
||||
/**
|
||||
* Get the id associated with the current document
|
||||
*/
|
||||
@ -37,16 +37,6 @@ public interface IndexJournalPointer {
|
||||
*/
|
||||
long documentMeta();
|
||||
|
||||
/**
|
||||
* Get the wordId associated with the current record
|
||||
*/
|
||||
long wordId();
|
||||
|
||||
/**
|
||||
* Get the termMeta associated with the current record
|
||||
*/
|
||||
long wordMeta();
|
||||
|
||||
/**
|
||||
* Get the documentFeatures associated with the current record
|
||||
*/
|
||||
@ -64,6 +54,8 @@ public interface IndexJournalPointer {
|
||||
default IndexJournalPointer filterWordMeta(LongPredicate filter) {
|
||||
return new FilteringJournalPointer(this, filter);
|
||||
}
|
||||
|
||||
void close() throws IOException;
|
||||
}
|
||||
|
||||
class JoiningJournalPointer implements IndexJournalPointer {
|
||||
@ -86,11 +78,6 @@ class JoiningJournalPointer implements IndexJournalPointer {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean nextRecord() {
|
||||
return pointers[pIndex].nextRecord();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long documentId() {
|
||||
return pointers[pIndex].documentId();
|
||||
@ -101,20 +88,28 @@ class JoiningJournalPointer implements IndexJournalPointer {
|
||||
return pointers[pIndex].documentMeta();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordId() {
|
||||
return pointers[pIndex].wordId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordMeta() {
|
||||
return pointers[pIndex].wordMeta();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int documentFeatures() {
|
||||
return pointers[pIndex].documentFeatures();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<IndexJournalEntryTermData> iterator() {
|
||||
return pointers[pIndex].iterator();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
for (var p : pointers) {
|
||||
try {
|
||||
p.close();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
class FilteringJournalPointer implements IndexJournalPointer {
|
||||
@ -128,14 +123,10 @@ class FilteringJournalPointer implements IndexJournalPointer {
|
||||
|
||||
@Override
|
||||
public boolean nextDocument() {
|
||||
return base.nextDocument();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean nextRecord() {
|
||||
while (base.nextRecord()) {
|
||||
if (filter.test(wordMeta()))
|
||||
while (base.nextDocument()) {
|
||||
if (iterator().hasNext()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -150,18 +141,49 @@ class FilteringJournalPointer implements IndexJournalPointer {
|
||||
return base.documentMeta();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordId() {
|
||||
return base.wordId();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordMeta() {
|
||||
return base.wordMeta();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int documentFeatures() {
|
||||
return base.documentFeatures();
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public Iterator<IndexJournalEntryTermData> iterator() {
|
||||
|
||||
return new Iterator<>() {
|
||||
private final Iterator<IndexJournalEntryTermData> baseIter = base.iterator();
|
||||
private IndexJournalEntryTermData value = null;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (value != null) {
|
||||
return true;
|
||||
}
|
||||
while (baseIter.hasNext()) {
|
||||
value = baseIter.next();
|
||||
if (filter.test(value.metadata())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
value = null;
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexJournalEntryTermData next() {
|
||||
if (hasNext()) {
|
||||
var ret = value;
|
||||
value = null;
|
||||
return ret;
|
||||
} else {
|
||||
throw new IllegalStateException("No more elements");
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
base.close();
|
||||
}
|
||||
}
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.index.journal.writer;
|
||||
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntry;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@ -12,18 +12,7 @@ import java.io.IOException;
|
||||
* @see IndexJournalWriterPagingImpl
|
||||
*/
|
||||
public interface IndexJournalWriter extends AutoCloseable {
|
||||
/** Write an entry to the journal.
|
||||
*
|
||||
* @param header the header of the entry
|
||||
* @param entry the data of the entry
|
||||
*
|
||||
* @return the number of bytes written
|
||||
*/
|
||||
int put(IndexJournalEntryHeader header, IndexJournalEntryData entry);
|
||||
default int put(IndexJournalEntry entry) {
|
||||
return put(entry.header(), entry.data());
|
||||
}
|
||||
|
||||
void close() throws IOException;
|
||||
|
||||
int put(IndexJournalEntryHeader header, IndexJournalEntryData data);
|
||||
}
|
||||
|
@ -49,13 +49,14 @@ public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
|
||||
public int put(IndexJournalEntryHeader header, IndexJournalEntryData data)
|
||||
{
|
||||
if (bytesWritten >= sizeLimitBytes) {
|
||||
bytesWritten = 0;
|
||||
switchToNextWriter();
|
||||
}
|
||||
|
||||
int writtenNow = currentWriter.put(header, entry);
|
||||
int writtenNow = currentWriter.put(header, data);
|
||||
bytesWritten += writtenNow;
|
||||
|
||||
return writtenNow;
|
||||
|
@ -2,8 +2,9 @@ package nu.marginalia.index.journal.writer;
|
||||
|
||||
import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -22,6 +23,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
||||
private static final int ZSTD_BUFFER_SIZE = 8192;
|
||||
private static final int DATA_BUFFER_SIZE = 8192;
|
||||
|
||||
private final MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||
|
||||
private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
|
||||
|
||||
private final ZstdDirectBufferCompressingStream compressingStream;
|
||||
@ -75,36 +78,48 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
|
||||
public int put(IndexJournalEntryHeader header,
|
||||
IndexJournalEntryData data)
|
||||
{
|
||||
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
|
||||
dataBuffer.flip();
|
||||
compressingStream.compress(dataBuffer);
|
||||
dataBuffer.clear();
|
||||
}
|
||||
|
||||
dataBuffer.putInt(entry.size());
|
||||
final long[] keywords = data.termIds();
|
||||
final long[] metadata = data.metadata();
|
||||
final var positions = data.positions();
|
||||
|
||||
int recordSize = 0; // document header size is 3 longs
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
// term header: termId + metadata (2 longs) plus a 1-byte positions length = TERM_HEADER_SIZE_BYTES
|
||||
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
|
||||
}
|
||||
|
||||
dataBuffer.putInt(recordSize);
|
||||
dataBuffer.putInt(header.documentFeatures());
|
||||
dataBuffer.putLong(header.combinedId());
|
||||
dataBuffer.putLong(header.documentMeta());
|
||||
|
||||
for (int i = 0; i < entry.size(); ) {
|
||||
int remaining = (dataBuffer.capacity() - dataBuffer.position()) / 8;
|
||||
if (remaining <= 0) {
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
|
||||
|
||||
if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) {
|
||||
dataBuffer.flip();
|
||||
compressingStream.compress(dataBuffer);
|
||||
dataBuffer.clear();
|
||||
}
|
||||
else while (remaining-- > 0 && i < entry.size()) {
|
||||
|
||||
dataBuffer.putLong(entry.underlyingArray[i++]);
|
||||
}
|
||||
dataBuffer.putLong(keywords[i]);
|
||||
dataBuffer.putLong(metadata[i]);
|
||||
dataBuffer.put((byte) positions[i].size());
|
||||
dataBuffer.put(positions[i].buffer());
|
||||
}
|
||||
|
||||
numEntries++;
|
||||
|
||||
final int bytesWritten = 8 * ( /*header = 3 longs */ 3 + entry.size());
|
||||
|
||||
return bytesWritten;
|
||||
return recordSize;
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
@ -121,7 +136,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
|
||||
|
||||
|
||||
// Finalize the file by writing a header in the beginning
|
||||
ByteBuffer header = ByteBuffer.allocate(16);
|
||||
ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
|
||||
header.putLong(numEntries);
|
||||
header.putLong(0); // reserved for future use
|
||||
header.flip();
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntry;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||
@ -18,52 +17,52 @@ import java.util.List;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class IndexJournalTest {
|
||||
Path tempFile;
|
||||
IndexJournalReader reader;
|
||||
|
||||
long firstDocId = UrlIdCodec.encodeId(44, 10);
|
||||
long secondDocId = UrlIdCodec.encodeId(43, 15);
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
|
||||
var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
|
||||
journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
|
||||
.add(1, 2)
|
||||
.add(2, 3)
|
||||
.add(3, 4)
|
||||
.add(5, 6).build());
|
||||
|
||||
journalWriter.put(IndexJournalEntry.builder(43, 15, 10)
|
||||
.add(5, 5)
|
||||
.add(6, 6)
|
||||
.build());
|
||||
journalWriter.close();
|
||||
|
||||
reader = new IndexJournalReaderSingleFile(tempFile);
|
||||
}
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void forEachDocId() {
|
||||
List<Long> expected = List.of(firstDocId, secondDocId);
|
||||
List<Long> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachDocId(actual::add);
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void forEachWordId() {
|
||||
List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
|
||||
List<Integer> actual = new ArrayList<>();
|
||||
|
||||
reader.forEachWordId(i -> actual.add((int) i));
|
||||
assertEquals(expected, actual);
|
||||
}
|
||||
// Path tempFile;
|
||||
// IndexJournalReader reader;
|
||||
//
|
||||
// long firstDocId = UrlIdCodec.encodeId(44, 10);
|
||||
// long secondDocId = UrlIdCodec.encodeId(43, 15);
|
||||
//
|
||||
// @BeforeEach
|
||||
// public void setUp() throws IOException {
|
||||
// tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
//
|
||||
// var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
|
||||
// journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
|
||||
// .add(1, 2)
|
||||
// .add(2, 3)
|
||||
// .add(3, 4)
|
||||
// .add(5, 6).build());
|
||||
//
|
||||
// journalWriter.put(IndexJournalEntry.builder(43, 15, 10)
|
||||
// .add(5, 5)
|
||||
// .add(6, 6)
|
||||
// .build());
|
||||
// journalWriter.close();
|
||||
//
|
||||
// reader = new IndexJournalReaderSingleFile(tempFile);
|
||||
// }
|
||||
// @AfterEach
|
||||
// public void tearDown() throws IOException {
|
||||
// Files.delete(tempFile);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void forEachDocId() {
|
||||
// List<Long> expected = List.of(firstDocId, secondDocId);
|
||||
// List<Long> actual = new ArrayList<>();
|
||||
//
|
||||
// reader.forEachDocId(actual::add);
|
||||
// assertEquals(expected, actual);
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// public void forEachWordId() {
|
||||
// List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
|
||||
// List<Integer> actual = new ArrayList<>();
|
||||
//
|
||||
// reader.forEachWordId(i -> actual.add((int) i));
|
||||
// assertEquals(expected, actual);
|
||||
// }
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,367 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
public class IndexJournalWriterTest {
|
||||
Path tempFile;
|
||||
Path tempFile2;
|
||||
ByteBuffer workArea = ByteBuffer.allocate(1024);
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
||||
}
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
Files.delete(tempFile);
|
||||
Files.delete(tempFile2);
|
||||
}
|
||||
|
||||
private GammaCodedSequence gcs(int... values) {
|
||||
return GammaCodedSequence.generate(workArea, values);
|
||||
}
|
||||
|
||||
static MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||
static long wordId(String str) {
|
||||
return hasher.hashKeyword(str);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleFile() {
|
||||
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
|
||||
// Write two documents with two terms each
|
||||
writer.put(new IndexJournalEntryHeader(11, 22, 33),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{44, 55},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(1, 3, 5),
|
||||
gcs(2, 4, 6),
|
||||
})
|
||||
);
|
||||
writer.put(new IndexJournalEntryHeader(12, 23, 34),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{45, 56},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(2, 4, 6),
|
||||
gcs(3, 5, 7),
|
||||
})
|
||||
);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
|
||||
// Read the journal back
|
||||
|
||||
try {
|
||||
var reader = new IndexJournalReaderSingleFile(tempFile);
|
||||
|
||||
Iterator<IndexJournalEntryTermData> iter;
|
||||
IndexJournalEntryTermData termData;
|
||||
|
||||
try (var ptr = reader.newPointer()) {
|
||||
|
||||
/** DOCUMENT 1 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(11, ptr.documentId());
|
||||
assertEquals(22, ptr.documentFeatures());
|
||||
assertEquals(33, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(44, termData.metadata());
|
||||
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
|
||||
|
||||
// Term 2
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word2"), termData.termId());
|
||||
assertEquals(55, termData.metadata());
|
||||
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
|
||||
|
||||
// No more terms
|
||||
|
||||
assertFalse(iter.hasNext());
|
||||
|
||||
/** DOCUMENT 2 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(12, ptr.documentId());
|
||||
assertEquals(23, ptr.documentFeatures());
|
||||
assertEquals(34, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(45, termData.metadata());
|
||||
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
|
||||
|
||||
// Term 2
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word2"), termData.termId());
|
||||
assertEquals(56, termData.metadata());
|
||||
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
|
||||
|
||||
// No more terms
|
||||
assertFalse(iter.hasNext());
|
||||
|
||||
// No more documents
|
||||
assertFalse(ptr.nextDocument());
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiFile() {
|
||||
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
|
||||
writer.put(new IndexJournalEntryHeader(11, 22, 33),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{44, 55},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(1, 3, 5),
|
||||
gcs(2, 4, 6),
|
||||
})
|
||||
);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
|
||||
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
|
||||
writer.put(new IndexJournalEntryHeader(12, 23, 34),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{45, 56},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(2, 4, 6),
|
||||
gcs(3, 5, 7),
|
||||
})
|
||||
);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
|
||||
// Read the journal back
|
||||
|
||||
try {
|
||||
var reader = new IndexJournalReaderPagingImpl(List.of(tempFile, tempFile2));
|
||||
|
||||
Iterator<IndexJournalEntryTermData> iter;
|
||||
IndexJournalEntryTermData termData;
|
||||
|
||||
try (var ptr = reader.newPointer()) {
|
||||
|
||||
/** DOCUMENT 1 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(11, ptr.documentId());
|
||||
assertEquals(22, ptr.documentFeatures());
|
||||
assertEquals(33, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(44, termData.metadata());
|
||||
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
|
||||
|
||||
// Term 2
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word2"), termData.termId());
|
||||
assertEquals(55, termData.metadata());
|
||||
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
|
||||
|
||||
// No more terms
|
||||
|
||||
assertFalse(iter.hasNext());
|
||||
|
||||
/** DOCUMENT 2 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(12, ptr.documentId());
|
||||
assertEquals(23, ptr.documentFeatures());
|
||||
assertEquals(34, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(45, termData.metadata());
|
||||
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
|
||||
|
||||
// Term 2
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word2"), termData.termId());
|
||||
assertEquals(56, termData.metadata());
|
||||
assertEquals(IntList.of(3, 5, 7), termData.positions().values());
|
||||
|
||||
// No more terms
|
||||
assertFalse(iter.hasNext());
|
||||
|
||||
// No more documents
|
||||
assertFalse(ptr.nextDocument());
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleFileIterTwice() {
|
||||
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
|
||||
// Write two documents with two terms each
|
||||
writer.put(new IndexJournalEntryHeader(11, 22, 33),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{44, 55},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(1, 3, 5),
|
||||
gcs(2, 4, 6),
|
||||
})
|
||||
);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
|
||||
// Read the journal back
|
||||
|
||||
try {
|
||||
var reader = new IndexJournalReaderSingleFile(tempFile);
|
||||
|
||||
Iterator<IndexJournalEntryTermData> iter;
|
||||
IndexJournalEntryTermData termData;
|
||||
|
||||
try (var ptr = reader.newPointer()) {
|
||||
|
||||
/** DOCUMENT 1 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(11, ptr.documentId());
|
||||
assertEquals(22, ptr.documentFeatures());
|
||||
assertEquals(33, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(44, termData.metadata());
|
||||
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
|
||||
|
||||
// Ensure we can iterate again over the same document without persisting state or closing the pointer
|
||||
|
||||
iter = ptr.iterator();
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(44, termData.metadata());
|
||||
assertEquals(IntList.of(1, 3, 5), termData.positions().values());
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFiltered() {
|
||||
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
|
||||
// Write two documents with two terms each
|
||||
writer.put(new IndexJournalEntryHeader(11, 22, 33),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{44, 55},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(1, 3, 5),
|
||||
gcs(2, 4, 6),
|
||||
})
|
||||
);
|
||||
writer.put(new IndexJournalEntryHeader(12, 23, 34),
|
||||
new IndexJournalEntryData(
|
||||
new String[]{"word1", "word2"},
|
||||
new long[]{45, 56},
|
||||
new GammaCodedSequence[]{
|
||||
gcs(2, 4, 6),
|
||||
gcs(3, 5, 7),
|
||||
}
|
||||
));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
|
||||
// Read the journal back
|
||||
|
||||
try {
|
||||
var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45);
|
||||
|
||||
Iterator<IndexJournalEntryTermData> iter;
|
||||
IndexJournalEntryTermData termData;
|
||||
|
||||
try (var ptr = reader.newPointer()) {
|
||||
/** DOCUMENT 2 */
|
||||
assertTrue(ptr.nextDocument());
|
||||
assertEquals(12, ptr.documentId());
|
||||
assertEquals(23, ptr.documentFeatures());
|
||||
assertEquals(34, ptr.documentMeta());
|
||||
|
||||
iter = ptr.iterator();
|
||||
// Term 1
|
||||
assertTrue(iter.hasNext());
|
||||
termData = iter.next();
|
||||
assertEquals(wordId("word1"), termData.termId());
|
||||
assertEquals(45, termData.metadata());
|
||||
assertEquals(IntList.of(2, 4, 6), termData.positions().values());
|
||||
|
||||
// No more terms
|
||||
assertFalse(iter.hasNext());
|
||||
// No more documents
|
||||
assertFalse(ptr.nextDocument());
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -9,125 +9,125 @@ import java.util.ArrayList;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class IndexJournalPointerTest {
|
||||
|
||||
@Test
|
||||
public void concatenate() {
|
||||
MockPointer left = new MockPointer(
|
||||
List.of(new MockDocument(1, 2, 3, List.of(
|
||||
new MockRecord(4, 5),
|
||||
new MockRecord(6, 7))
|
||||
))
|
||||
);
|
||||
|
||||
MockPointer right = new MockPointer(
|
||||
List.of(new MockDocument(8, 9, 10, List.of(
|
||||
new MockRecord(11, 12),
|
||||
new MockRecord(13, 14))
|
||||
))
|
||||
);
|
||||
|
||||
IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right);
|
||||
List<Long> docIdsSeq = new ArrayList<>();
|
||||
List<Long> wordIdsSeq = new ArrayList<>();
|
||||
while (concatenated.nextDocument()) {
|
||||
docIdsSeq.add(concatenated.documentId());
|
||||
while (concatenated.nextRecord()) {
|
||||
wordIdsSeq.add(concatenated.wordId());
|
||||
}
|
||||
}
|
||||
|
||||
assertEquals(docIdsSeq, List.of(1L, 8L));
|
||||
assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void filter() {
|
||||
MockPointer left = new MockPointer(
|
||||
List.of(new MockDocument(1, 2, 3, List.of(
|
||||
new MockRecord(1, 1),
|
||||
new MockRecord(2, 2),
|
||||
new MockRecord(3, 3),
|
||||
new MockRecord(4, 4),
|
||||
new MockRecord(5, 5)
|
||||
)
|
||||
), new MockDocument(2, 2, 3, List.of(
|
||||
new MockRecord(1, 1),
|
||||
new MockRecord(3, 3),
|
||||
new MockRecord(5, 5)
|
||||
)
|
||||
))
|
||||
|
||||
);
|
||||
var filtered = left.filterWordMeta(meta -> (meta % 2) == 0);
|
||||
|
||||
List<Long> docIdsSeq = new ArrayList<>();
|
||||
List<Long> wordIdsSeq = new ArrayList<>();
|
||||
while (filtered.nextDocument()) {
|
||||
docIdsSeq.add(filtered.documentId());
|
||||
while (filtered.nextRecord()) {
|
||||
wordIdsSeq.add(filtered.wordId());
|
||||
}
|
||||
}
|
||||
|
||||
assertEquals(docIdsSeq, List.of(1L, 2L));
|
||||
assertEquals(wordIdsSeq, List.of(2L, 4L));
|
||||
}
|
||||
|
||||
class MockPointer implements IndexJournalPointer {
|
||||
private final List<MockDocument> documents;
|
||||
|
||||
int di = -1;
|
||||
int ri;
|
||||
|
||||
public MockPointer(Collection<MockDocument> documents) {
|
||||
this.documents = new ArrayList<>(documents);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean nextDocument() {
|
||||
if (++di < documents.size()) {
|
||||
ri = -1;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean nextRecord() {
|
||||
if (++ri < documents.get(di).records.size()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long documentId() {
|
||||
return documents.get(di).docId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long documentMeta() {
|
||||
return documents.get(di).docMeta;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordId() {
|
||||
return documents.get(di).records.get(ri).wordId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long wordMeta() {
|
||||
return documents.get(di).records.get(ri).wordMeta;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int documentFeatures() {
|
||||
return documents.get(di).docFeatures;
|
||||
}
|
||||
}
|
||||
|
||||
record MockDocument(long docId, long docMeta, int docFeatures, List<MockRecord> records) {}
|
||||
record MockRecord(long wordId, long wordMeta) {}
|
||||
//
// @Test
// public void concatenate() {
// MockPointer left = new MockPointer(
// List.of(new MockDocument(1, 2, 3, List.of(
// new MockRecord(4, 5),
// new MockRecord(6, 7))
// ))
// );
//
// MockPointer right = new MockPointer(
// List.of(new MockDocument(8, 9, 10, List.of(
// new MockRecord(11, 12),
// new MockRecord(13, 14))
// ))
// );
//
// IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right);
// List<Long> docIdsSeq = new ArrayList<>();
// List<Long> wordIdsSeq = new ArrayList<>();
// while (concatenated.nextDocument()) {
// docIdsSeq.add(concatenated.documentId());
// while (concatenated.nextRecord()) {
// wordIdsSeq.add(concatenated.termId());
// }
// }
//
// assertEquals(docIdsSeq, List.of(1L, 8L));
// assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L));
// }
//
// @Test
// public void filter() {
// MockPointer left = new MockPointer(
// List.of(new MockDocument(1, 2, 3, List.of(
// new MockRecord(1, 1),
// new MockRecord(2, 2),
// new MockRecord(3, 3),
// new MockRecord(4, 4),
// new MockRecord(5, 5)
// )
// ), new MockDocument(2, 2, 3, List.of(
// new MockRecord(1, 1),
// new MockRecord(3, 3),
// new MockRecord(5, 5)
// )
// ))
//
// );
// var filtered = left.filterWordMeta(meta -> (meta % 2) == 0);
//
// List<Long> docIdsSeq = new ArrayList<>();
// List<Long> wordIdsSeq = new ArrayList<>();
// while (filtered.nextDocument()) {
// docIdsSeq.add(filtered.documentId());
// while (filtered.nextRecord()) {
// wordIdsSeq.add(filtered.termId());
// }
// }
//
// assertEquals(docIdsSeq, List.of(1L, 2L));
// assertEquals(wordIdsSeq, List.of(2L, 4L));
// }
//
// class MockPointer implements IndexJournalPointer {
// private final List<MockDocument> documents;
//
// int di = -1;
// int ri;
//
// public MockPointer(Collection<MockDocument> documents) {
// this.documents = new ArrayList<>(documents);
// }
//
// @Override
// public boolean nextDocument() {
// if (++di < documents.size()) {
// ri = -1;
// return true;
// }
//
// return false;
// }
//
// @Override
// public boolean nextRecord() {
// if (++ri < documents.get(di).records.size()) {
// return true;
// }
//
// return false;
// }
//
// @Override
// public long documentId() {
// return documents.get(di).docId;
// }
//
// @Override
// public long documentMeta() {
// return documents.get(di).docMeta;
// }
//
// @Override
// public long termId() {
// return documents.get(di).records.get(ri).termId;
// }
//
// @Override
// public long wordMeta() {
// return documents.get(di).records.get(ri).wordMeta;
// }
//
// @Override
// public int documentFeatures() {
// return documents.get(di).docFeatures;
// }
// }
//
// record MockDocument(long docId, long docMeta, int docFeatures, List<MockRecord> records) {}
// record MockRecord(long termId, long wordMeta) {}
}
@@ -16,12 +16,16 @@ apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
implementation project(':code:libraries:array')
implementation project(':code:libraries:btree')
implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:random-write-funnel')
implementation project(':code:index:query')
implementation project(':code:index:index-journal')
implementation project(':code:common:model')
implementation project(':code:common:process')

implementation project(':third-party:parquet-floor')
implementation project(':third-party:commons-codec')


implementation libs.bundles.slf4j

@@ -0,0 +1,51 @@
package nu.marginalia.index.construction;

import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class PositionsFileConstructor implements AutoCloseable {
private final Path file;
private final FileChannel channel;

private long offset;
private final ByteBuffer workBuffer = ByteBuffer.allocate(8192);

public PositionsFileConstructor(Path file) throws IOException {
this.file = file;

channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE);
}

/** Add a term to the positions file
* @param termMeta the term metadata
* @param positions the positions of the term
* @return the offset of the term in the file
*/
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
synchronized (file) {
var positionBuffer = positions.buffer();
int size = 1 + positionBuffer.remaining();

if (workBuffer.remaining() < size) {
workBuffer.flip();
channel.write(workBuffer);
workBuffer.clear();
}
workBuffer.put(termMeta);
workBuffer.put(positionBuffer);

offset += size;
return offset;
}
}

public void close() throws IOException {
channel.force(false);
channel.close();
}
}
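Each entry written here is one term-metadata byte followed by the gamma-coded positions, located via the offset bookkeeping above. A hypothetical reader for a single entry might look like the sketch below; PositionsFileReader and the explicit length parameter are assumptions for illustration, not part of this commit.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

class PositionsFileReader {
    /** Read back one entry written by PositionsFileConstructor.add();
     * the entry length is assumed to be known to the caller,
     * e.g. from the offset of the following entry. */
    static GammaCodedSequence readEntry(FileChannel channel, long offset, int length) throws IOException {
        ByteBuffer buf = ByteBuffer.allocate(length);
        channel.read(buf, offset);
        buf.flip();
        byte termMeta = buf.get();          // metadata byte, written first by add()
        return new GammaCodedSequence(buf); // remaining bytes hold the coded positions
    }
}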
@@ -7,6 +7,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;

@@ -48,18 +49,22 @@ public class ReverseIndexConstructor {
return;
}

Path positionsFile = tmpDir.resolve("positions.dat");
Files.deleteIfExists(positionsFile);
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName)) {

heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);

try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) {
try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
PositionsFileConstructor posConstructor = new PositionsFileConstructor(positionsFile);
) {

AtomicInteger progress = new AtomicInteger(0);
inputs
.parallelStream()
.map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
return construct(in);
return construct(in, posConstructor);
})
.reduce(this::merge)
.ifPresent((index) -> {
@@ -73,9 +78,9 @@ public class ReverseIndexConstructor {
}

@SneakyThrows
private ReversePreindexReference construct(Path input) {
private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
return ReversePreindex
.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}

@@ -40,6 +40,7 @@ public class ReversePreindex {
* will have randomly assigned names.
*/
public static ReversePreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
{
@@ -48,7 +49,7 @@ public class ReversePreindex {
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");

var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
return new ReversePreindex(segments, docs);
}

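For orientation, a call site with the new signature might look like the sketch below; journalReader, posConstructor and workDir are placeholders, and in practice a single PositionsFileConstructor is shared across all inputs as in ReverseIndexConstructor above.

// A minimal sketch of the revised constructPreindex() call, under the
// assumptions stated above.
ReversePreindex preindex = ReversePreindex.constructPreindex(
        journalReader,              // an IndexJournalReader for one journal file
        posConstructor,             // shared positions file writer
        DocIdRewriter.identity(),
        workDir);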
@@ -21,6 +21,7 @@ import java.util.concurrent.TimeUnit;
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
private static PositionsFileConstructor positionsFileConstructor;
final Path file;
public final LongArray documents;
private static final int RECORD_SIZE_LONGS = 2;
@@ -36,7 +37,9 @@ public class ReversePreindexDocuments {
Path workDir,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
ReversePreindexWordSegments segments) throws IOException {
ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;

createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);

@@ -75,14 +78,14 @@ public class ReversePreindexDocuments {
var pointer = reader.newPointer();
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
while (pointer.nextRecord()) {
long wordId = pointer.wordId();
long wordMeta = pointer.wordMeta();
for (var termData : pointer) {
long termId = termData.termId();

long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());

assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, wordMeta);
assembly.put(offset + 1, posOffset);
}
}

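The net effect is that each two-long record now pairs the rank-encoded document id with an offset into the positions file instead of the term metadata word. A rough sketch of walking the finished array, assuming LongArray exposes size() and get(long) as used elsewhere in the codebase:

LongArray documents = docs.documents;
for (long i = 0; i < documents.size(); i += 2) {
    long rankEncodedId   = documents.get(i);     // rewritten, rank-encoded document id
    long positionsOffset = documents.get(i + 1); // where the term meta byte + positions were written
    // resolve the gamma-coded positions from the positions file at positionsOffset
}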
@@ -12,7 +12,7 @@ import java.nio.file.Files;
import java.nio.file.Path;

/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each wordId.
* and the count of documents associated with each termId.
*/
public class ReversePreindexWordSegments {
public final LongArray wordIds;
@@ -34,7 +34,7 @@ public class ReversePreindexWordSegments {
this.countsFile = countsFile;
}

/** Returns a long-long hash map where each key is a wordId,
/** Returns a long-long hash map where each key is a termId,
* and each value is the start offset of the data.
*/
public Long2LongOpenHashMap asMap(int recordSize) {
@@ -188,7 +188,7 @@ public class ReversePreindexWordSegments {

if (i == fileSize) {
// We've reached the end of the iteration and there is no
// "next" wordId to fetch
// "next" termId to fetch
wordId = Long.MIN_VALUE;
return false;
}

@@ -2,12 +2,14 @@ package nu.marginalia.index;

import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.io.IOException;
import java.nio.file.Files;
@@ -89,7 +91,9 @@ class ReverseIndexReaderTest {

private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
var preindex = ReversePreindex.constructPreindex(reader,
Mockito.mock(PositionsFileConstructor.class),
DocIdRewriter.identity(), tempDir);


Path docsFile = tempDir.resolve("docs.dat");
@@ -100,6 +100,7 @@ class ReversePreindexDocsTest {

assertEquals(expected, actual);
}

@Test
public void testDocs2() throws IOException {
var reader = journalFactory.createReader(
@@ -108,7 +109,7 @@ class ReversePreindexDocsTest {
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);

List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
@@ -5,6 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
import java.nio.file.Files;
@@ -60,12 +61,18 @@ public class TestJournalFactory {

var writer = new IndexJournalWriterSingleFileImpl(jf);
for (var entry : entries) {
long[] data = new long[entry.wordIds.length * 2];
for (int i = 0; i < entry.wordIds.length; i++)
data[i*2] = entry.wordIds[i];
long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];

GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i];
meta[i] = 0;
positions[i] = new GammaCodedSequence(new byte[1]);
}

writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
new IndexJournalEntryData(data));
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);
@@ -77,14 +84,18 @@ public class TestJournalFactory {

var writer = new IndexJournalWriterSingleFileImpl(jf);
for (var entry : entries) {
long[] data = new long[entry.wordIds.length * 2];

long[] termIds = new long[entry.wordIds.length];
long[] meta = new long[entry.wordIds.length];
GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length];
for (int i = 0; i < entry.wordIds.length; i++) {
data[i * 2] = entry.wordIds[i].wordId;
data[i * 2 + 1] = entry.wordIds[i].meta;
termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta;
positions[i] = new GammaCodedSequence(new byte[1]);
}

writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
new IndexJournalEntryData(data));
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
var ret = new IndexJournalReaderSingleFile(jf);

@@ -8,15 +8,16 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@@ -41,6 +42,7 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@@ -300,7 +302,18 @@ public class IndexQueryServiceIntegrationSmokeTest {
"test", "test", 0., "HTML5", 0, null, 0, 10
));

indexJournalWriter.put(header, new IndexJournalEntryData(data));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}

indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}

@SneakyThrows
@@ -309,19 +322,24 @@ public class IndexQueryServiceIntegrationSmokeTest {
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());

long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {
data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}

ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));


indexJournalWriter.put(header, new IndexJournalEntryData(data));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i);
}

indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}

}
@@ -7,13 +7,14 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@@ -44,6 +45,7 @@ import org.junit.jupiter.api.parallel.Execution;

import javax.annotation.CheckReturnValue;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@@ -549,13 +551,13 @@ public class IndexQueryServiceIntegrationTest {
meta.documentMetadata.encode()
);

long[] dataArray = new long[words.size() * 2];
for (int i = 0; i < words.size(); i++) {
dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword);
dataArray[2*i+1] = words.get(i).termMetadata;
}
var entry = new IndexJournalEntryData(dataArray);
indexJournalWriter.put(header, entry);
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions?
Arrays.setAll(positions, i -> new GammaCodedSequence(ByteBuffer.allocate(1)));

indexJournalWriter.put(header,
new IndexJournalEntryData(keywords, metadata, positions));
});

var linkdbWriter = new DocumentDbWriter(
@@ -7,18 +7,30 @@ import nu.marginalia.sequence.io.BitWriter;

import java.nio.ByteBuffer;

/** Implement coding and decoding of sequences of integers using the Elias Gamma code
*
* https://en.wikipedia.org/wiki/Elias_gamma_coding
/** Implement coding and decoding of sequences of integers using the Elias Gamma code.
* The sequence is prefixed by the number of integers in the sequence, then the delta between
* each integer in the sequence is encoded using the Elias Gamma code.
* <p></p>
* <a href="https://en.wikipedia.org/wiki/Elias_gamma_coding">https://en.wikipedia.org/wiki/Elias_gamma_coding</a>
* */
public class EliasGammaCodec implements IntIterator {

private final BitReader reader;
int rem = 0;
private int last = 0;
private int next = 0;

private EliasGammaCodec(ByteBuffer buffer) {
reader = new BitReader(buffer);

int bits = reader.takeWhileZero();

if (!reader.hasMore()) {
rem = 0;
}
else {
rem = reader.get(bits);
}
}

/** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */
@@ -31,7 +43,13 @@ public class EliasGammaCodec implements IntIterator {
* or equal to zero.
*/
public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) {
if (sequence.isEmpty())
return ByteBuffer.allocate(0);

var writer = new BitWriter(workArea);

writer.putGammaCoded(sequence.size());

int last = 0;

for (var iter = sequence.iterator(); iter.hasNext(); ) {
@@ -42,9 +60,7 @@ public class EliasGammaCodec implements IntIterator {
// can't encode zeroes
assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values";

int bits = Integer.numberOfTrailingZeros(Integer.highestOneBit(delta));
writer.put(0, bits + 1);
writer.put(delta, bits + 1);
writer.putGammaCoded(delta);
}

return writer.finish();
@@ -60,16 +76,13 @@ public class EliasGammaCodec implements IntIterator {

@Override
public boolean hasNext() {
if (next > 0)
return true;
if (!reader.hasMore())
return false;
if (next > 0) return true;
if (!reader.hasMore() || --rem < 0) return false;

int bits = reader.takeWhileZero();

if (!reader.hasMore()) {
return false;
}
if (!reader.hasMore()) return false;

int delta = reader.get(bits);
last += delta;
next = last;

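A small round-trip sketch of the revised codec (IntList and IntIterator are the fastutil types used throughout; the work-area size and values are arbitrary, and the input must be strictly increasing and positive):

ByteBuffer workArea = ByteBuffer.allocate(64);
ByteBuffer encoded = EliasGammaCodec.encode(workArea, IntList.of(1, 3, 10));

// The buffer begins with the gamma-coded element count (3),
// followed by the gamma-coded deltas 1, 2 and 7.
IntIterator iter = EliasGammaCodec.decode(encoded);
while (iter.hasNext()) {
    System.out.println(iter.nextInt()); // prints 1, 3, 10
}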
@@ -16,6 +16,8 @@ import java.util.StringJoiner;
* */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
private final ByteBuffer raw;
int startPos = 0;
int startLimit = 0;

/** Create a new GammaCodedSequence from a sequence of integers.
*
@@ -37,12 +39,16 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>

public GammaCodedSequence(ByteBuffer bytes) {
this.raw = bytes;
startPos = bytes.position();
startLimit = bytes.limit();
}

public GammaCodedSequence(byte[] bytes) {
raw = ByteBuffer.allocate(bytes.length);
raw.put(bytes);
raw.clear();
startPos = 0;
startLimit = bytes.length;
}

/** Return the raw bytes of the sequence. */
@@ -52,21 +58,29 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return raw.array();
}
else {
raw.clear();

byte[] bytes = new byte[raw.capacity()];
raw.get(bytes, 0, bytes.length);
raw.get(0, bytes, 0, bytes.length);
return bytes;
}
}

@Override
public IntIterator iterator() {
raw.clear();
raw.position(startPos);
raw.limit(startLimit);

return EliasGammaCodec.decode(raw);
}

public IntList values() {
var intItr = iterator();
IntArrayList ret = new IntArrayList(8);
while (intItr.hasNext()) {
ret.add(intItr.nextInt());
}
return ret;
}

/** Decode the sequence into an IntList;
* this is a somewhat slow operation,
* iterating over the data directly is more performant */
@@ -94,4 +108,15 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
}
return sj.toString();
}

public ByteBuffer buffer() {
raw.position(startPos);
raw.limit(startLimit);

return raw;
}

public int size() {
return raw.capacity();
}
}

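A usage sketch of the wrapper with the new accessors, mirroring how the tests below build position lists (buffer sizes are arbitrary):

ByteBuffer workArea = ByteBuffer.allocate(256);
GammaCodedSequence positions = GammaCodedSequence.generate(workArea, 1, 3, 5);

IntList decoded = positions.values();  // [1, 3, 5]
ByteBuffer coded = positions.buffer(); // positioned over the coded bytes, e.g. for PositionsFileConstructor.add()
int storedBytes = positions.size();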
@@ -78,7 +78,7 @@ public class BitReader {

int result = 0;

for (;;) {
do {
// Ensure we have bits to read
if (bitPosition <= 0) {
if (underlying.hasRemaining())
@@ -96,10 +96,8 @@ public class BitReader {
// Subtract the number of bits read from the current position
bitPosition -= zeroes;

// If bitPosition isn't zero, we've found a 1 and can stop
if (bitPosition > 0)
break;
}
// If bit position is positive, we've found a 1 and can stop
} while (bitPosition <= 0);

return result;
}
@@ -72,6 +72,17 @@ public class BitWriter {
}
}

/** Write the provided value in a gamma-coded format,
* e.g. by first finding the number of significant bits,
* then writing that many zeroes, then the bits themselves
*/
public void putGammaCoded(int value) {
int bits = 1 + Integer.numberOfTrailingZeros(Integer.highestOneBit(value));

put(0, bits);
put(value, bits);
}

public ByteBuffer finish() {
finishLastByte();

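A worked example of the layout putGammaCoded produces, read back with BitReader: the value 5 has three significant bits, so the writer emits three zero bits followed by 101. (A sketch, not taken from the test suite.)

var writer = new BitWriter(ByteBuffer.allocate(16));
writer.putGammaCoded(5);            // writes 000, then 101
ByteBuffer buffer = writer.finish();

var reader = new BitReader(buffer);
int bits = reader.takeWhileZero();  // 3, the length of the zero prefix
int value = reader.get(bits);       // reads the three value bits -> 5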
@@ -115,16 +115,17 @@ class BitReaderTest {
}

@Test
public void testTakeWhileZeroOverInt32() {
public void testTakeWhileZeroOverInt64() {
var writer = new BitWriter(ByteBuffer.allocate(1024));
writer.put(0, 32);
writer.put(0, 32);
writer.put(0, 2);
writer.putBit(true);
var buffer = writer.finish();

var reader = new BitReader(buffer);
int val = reader.takeWhileZero();
assertEquals(34, val);
assertEquals(66, val);
assertTrue(reader.getBit());
}
}
@@ -4,9 +4,9 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
@@ -18,9 +18,7 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.sql.SQLException;

import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;

@Singleton
public class LoaderIndexJournalWriter {
@@ -28,12 +26,11 @@ public class LoaderIndexJournalWriter {
private final IndexJournalWriter indexWriter;
private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);

private final MurmurHash3_128 hasher = new MurmurHash3_128();
private final long[] buffer = new long[MAX_LENGTH * 2];
private final long[] buffer = new long[65536];


@Inject
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException {
var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService);

var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea);
@@ -68,26 +65,10 @@ public class LoaderIndexJournalWriter {
return;
}

var pointer = wordSet.newPointer();

while (pointer.hasMore()) {
int i = 0;

while (i < buffer.length
&& pointer.advancePointer())
{
final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword());

buffer[i++] = hashedKeyword;
buffer[i++] = pointer.getMetadata();
}

var entry = new IndexJournalEntryData(i, buffer);
var header = new IndexJournalEntryHeader(combinedId, features, metadata);

indexWriter.put(header, entry);
}
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);

indexWriter.put(header, data);
}

public void close() throws Exception {
@@ -1,87 +0,0 @@
package nu.marginalia.loading.loader;

import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.LongStream;

import static org.junit.jupiter.api.Assertions.*;

class LoaderIndexJournalWriterTest {

Path tempDir;
LoaderIndexJournalWriter writer;
@BeforeEach
public void setUp() throws IOException, SQLException {
tempDir = Files.createTempDirectory(getClass().getSimpleName());
FileStorageService storageService = Mockito.mock(FileStorageService.class);

Mockito.when(storageService.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 1,null, tempDir.toString()));

writer = new LoaderIndexJournalWriter(storageService);
}

@AfterEach
public void tearDown() throws Exception {
writer.close();
List<Path> junk = Files.list(tempDir.resolve("iw")).toList();
for (var item : junk)
Files.delete(item);
Files.delete(tempDir.resolve("iw"));
Files.delete(tempDir);
}

@Test
public void testBreakup() throws Exception {
String[] keywords = new String[2000];
long[] metadata = new long[2000];
GammaCodedSequence[] positions = new GammaCodedSequence[2000];
ByteBuffer workArea = ByteBuffer.allocate(1024);
for (int i = 0; i < 2000; i++) {
keywords[i] = Integer.toString(i);
metadata[i] = i+1;
positions[i] = GammaCodedSequence.generate(workArea, 1, 2, 3);
}
DocumentKeywords words = new DocumentKeywords(keywords, metadata, positions);
writer.putWords(1, 0, new DocumentMetadata(0),
words);

writer.close();

List<Path> journalFiles = IndexJournalFileNames.findJournalFiles(tempDir.resolve("iw"));
assertEquals(1, journalFiles.size());

var reader = new IndexJournalReaderSingleFile(journalFiles.get(0));
List<Long> docIds = new ArrayList<>();
reader.forEachDocId(docIds::add);
assertEquals(List.of(1L, 1L), docIds);

List<Long> metas = new ArrayList<Long>();
var ptr = reader.newPointer();
while (ptr.nextDocument()) {
while (ptr.nextRecord()) {
metas.add(ptr.wordMeta());
}
}

assertEquals(LongStream.of(metadata).boxed().toList(), metas);
}
}
@@ -33,6 +33,7 @@ public class SearchMain extends MainClass {
new ServiceDiscoveryModule(),
new DatabaseModule(false)
);


// Orchestrate the boot order for the services
var registry = injector.getInstance(ServiceRegistryIf.class);