diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java index ab50fef5..e4916e31 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -1,6 +1,5 @@ package nu.marginalia.keyword.model; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.sequence.GammaCodedSequence; import java.io.Serial; @@ -26,26 +25,6 @@ public final class DocumentKeywords implements Serializable { assert keywords.length == metadata.length; } - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(getClass().getSimpleName()); - sb.append('['); - var pointer = newPointer(); - while (pointer.advancePointer()) { - sb.append("\n\t "); - - long metadata = pointer.getMetadata(); - String keyword = pointer.getKeyword(); - sb.append(keyword); - - if (metadata != 0) { - sb.append("/").append(new WordMetadata(metadata)); - } - } - return sb.append("\n]").toString(); - } - public boolean isEmpty() { return keywords.length == 0; } @@ -54,11 +33,6 @@ public final class DocumentKeywords implements Serializable { return keywords.length; } - /** Return a pointer for traversing this structure */ - public DocumentKeywordsPointer newPointer() { - return new DocumentKeywordsPointer(this); - } - } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 46bc2c15..a88dca0e 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java 
@@ -41,7 +41,7 @@ public class DocumentKeywordsBuilder { meta[i] = entry.getLongValue(); wordArray[i] = entry.getKey(); - positions[i] = GammaCodedSequence.generate(workArea, wordToPos.get(entry.getKey())); + positions[i] = GammaCodedSequence.generate(workArea, wordToPos.getOrDefault(entry.getKey(), IntList.of())); } return new DocumentKeywords(wordArray, meta, positions); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java deleted file mode 100644 index 960fff07..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java +++ /dev/null @@ -1,48 +0,0 @@ -package nu.marginalia.keyword.model; - -import nu.marginalia.sequence.GammaCodedSequence; - -/** Pointer into a {@see DocumentKeywords}. It starts out before the first position, - * forward with advancePointer(). - * */ -public class DocumentKeywordsPointer { - private int pos = -1; - - private final DocumentKeywords keywords; - - DocumentKeywordsPointer(DocumentKeywords keywords) { - this.keywords = keywords; - } - - /** Number of positions remaining */ - public int remaining() { - return keywords.size() - Math.max(0, pos); - } - - /** Return the keyword associated with the current position */ - public String getKeyword() { - return keywords.keywords[pos]; - } - - /** Return the metadata associated with the current position */ - public long getMetadata() { - return keywords.metadata[pos]; - } - - /** Return the positions associated with the current position */ - public GammaCodedSequence getPositions() { - return keywords.positions[pos]; - } - - /** Advance the current position, - * returns false if this was the - * last position */ - public boolean advancePointer() { - return ++pos < keywords.size(); - } - - /** Returns true unless the pointer is beyond the last position in the keyword set 
*/ - public boolean hasMore() { - return pos + 1 < keywords.size(); - } -} diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index ff064a21..2aafdc00 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -92,26 +92,17 @@ class DocumentKeywordExtractorTest { ); var keywordsBuilt = keywords.build(); - var ptr = keywordsBuilt.newPointer(); Map flags = new HashMap<>(); Map positions = new HashMap<>(); - ByteBuffer work = ByteBuffer.allocate(1024); + for (int i = 0; i < keywordsBuilt.size(); i++) { + String keyword = keywordsBuilt.keywords[i]; + long metadata = keywordsBuilt.metadata[i]; - while (ptr.advancePointer()) { - System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions()); - - int[] vals = ptr.getPositions().decode().toIntArray(); - for (int i = 0; i < vals.length; i++) { - vals[i] = vals[i] + 1; - } - var out = EliasGammaCodec.encode(work, vals); - System.out.println(out.capacity() + "/" + vals.length * 4); - - if (Set.of("dirty", "blues").contains(ptr.getKeyword())) { - flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata())); - positions.put(ptr.getKeyword(), ptr.getPositions()); + if (Set.of("dirty", "blues").contains(keyword)) { + flags.put(keyword, new WordMetadata(metadata)); + positions.put(keyword, keywordsBuilt.positions[i]); } } diff --git a/code/index/build.gradle b/code/index/build.gradle index 574c27d8..2f1cde13 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -15,12 +15,14 @@ dependencies { implementation 'org.jgrapht:jgrapht-core:1.5.2' implementation project(':third-party:commons-codec') + implementation project(':third-party:parquet-floor') 
implementation project(':code:index:api') implementation project(':code:functions:link-graph:api') implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:coded-sequence') implementation project(':code:common:db') implementation project(':code:common:config') diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index cf453e73..83e0cdc2 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -15,6 +15,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:coded-sequence') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java index de571664..b30f549f 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -2,12 +2,14 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.journal.model.IndexJournalEntry; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.process.control.FakeProcessHeartbeat; 
+import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -69,40 +71,40 @@ class ForwardIndexConverterTest { TestUtil.clearTempDir(dataDir); } - public int[] getFactorsI(int id) { - return IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - } - long createId(long url, long domain) { return UrlIdCodec.encodeId((int) domain, (int) url); } public void createEntry(IndexJournalWriter writer, int id) { - int[] factors = getFactorsI(id); - - var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5); - - for (int i = 0; i+1 < factors.length; i+=2) { - entryBuilder.add(factors[i], -factors[i+1]); - } - - writer.put(entryBuilder.build()); + writer.put( + new IndexJournalEntryHeader(createId(id, id/20), + id%3, + (id % 5)), + new IndexJournalEntryData( + new String[]{}, + new long[]{}, + new GammaCodedSequence[]{} + ) + ); } @Test void testForwardIndex() throws IOException { - new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert(); + new ForwardIndexConverter(new FakeProcessHeartbeat(), + new IndexJournalReaderSingleFile(indexFile), + docsFileId, + docsFileData, + new DomainRankings()).convert(); var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); for (int i = 36; i < workSetSize; i++) { long docId = createId(i, i/20); assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId)); + assertEquals((i % 3), forwardReader.getHtmlFeatures(docId)); assertEquals(i/20, UrlIdCodec.getDomainId(docId)); } - } - } \ No newline at end of file diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index 5380c0be..7274b8b2 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -13,8 +13,11 @@ java { apply from: 
"$rootProject.projectDir/srcsets.gradle" dependencies { + implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:array') implementation project(':code:common:model') + implementation project(':third-party:parquet-floor') + implementation project(':third-party:commons-codec') implementation libs.bundles.slf4j @@ -23,6 +26,7 @@ dependencies { implementation libs.guava implementation libs.trove implementation libs.zstd + implementation libs.fastutil implementation libs.commons.lang3 implementation libs.roaringbitmap diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java deleted file mode 100644 index 7d4944ac..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.model.id.UrlIdCodec; - -/** An entry in the index journal. 
- * - * @param header the header of the entry, containing document level data - * @param data the data of the entry, containing keyword level data - * - * @see IndexJournalEntryHeader - * @see IndexJournalEntryData - */ -public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { - - public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) { - return new IndexJournalEntryBuilder(0, documentId, documentMeta); - } - - public static IndexJournalEntryBuilder builder(int domainId, - int urlId, - long documentMeta) { - - - return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java deleted file mode 100644 index 6bfa19ea..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.index.journal.model; - -import gnu.trove.list.array.TLongArrayList; - -public class IndexJournalEntryBuilder { - private final long documentId; - private final int documentFeatures; - private final long documentMeta; - private final TLongArrayList items = new TLongArrayList(); - - public IndexJournalEntryBuilder( - int documentFeatures, - long documentId, - long documentMeta) { - this.documentFeatures = documentFeatures; - this.documentId = documentId; - this.documentMeta = documentMeta; - } - - public IndexJournalEntryBuilder add(long wordId, long metadata) { - - items.add(wordId); - items.add(metadata); - - return this; - } - - public IndexJournalEntry build() { - return new IndexJournalEntry( - new IndexJournalEntryHeader(items.size(), - documentFeatures, - documentId, - documentMeta), - new IndexJournalEntryData(items.toArray()) - ); - } -} diff --git 
a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java index 26c10c2a..71ef1d2a 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java @@ -1,77 +1,36 @@ package nu.marginalia.index.journal.model; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.sequence.GammaCodedSequence; -import java.util.Arrays; -import java.util.Iterator; +public record IndexJournalEntryData(long[] termIds, + long[] metadata, + GammaCodedSequence[] positions) { -/** The keyword data of an index journal entry. - * The data itself is an interleaved array of - * word ids and metadata. - *

- * Odd entries are term ids, even entries are encoded WordMetadata records. - *

- *

The civilized way of reading the journal data is to use an IndexJournalReader

- * - * @see WordMetadata - * @see IndexJournalReader - */ -public class IndexJournalEntryData implements Iterable { - private final int size; - public final long[] underlyingArray; - - public static final int MAX_LENGTH = 1000; - public static final int ENTRY_SIZE = 2; - - public IndexJournalEntryData(long[] underlyingArray) { - this.size = underlyingArray.length; - this.underlyingArray = underlyingArray; + public IndexJournalEntryData { + assert termIds.length == metadata.length; + assert termIds.length == positions.length; } - public IndexJournalEntryData(int size, long[] underlyingArray) { - this.size = size; - this.underlyingArray = underlyingArray; + public IndexJournalEntryData(String[] keywords, + long[] metadata, + GammaCodedSequence[] positions) + { + this(termIds(keywords), metadata, positions); } - public long get(int idx) { - if (idx >= size) - throw new ArrayIndexOutOfBoundsException(idx + " vs " + size); - return underlyingArray[idx]; - } + private static final MurmurHash3_128 hash = new MurmurHash3_128(); public int size() { - return size; - } - public long[] toArray() { - if (size == underlyingArray.length) - return underlyingArray; - else - return Arrays.copyOf(underlyingArray, size); + return termIds.length; } - public String toString() { - return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray())); - } - public Iterator iterator() { - return new EntryIterator(); - } - - private class EntryIterator implements Iterator { - int pos = -ENTRY_SIZE; - - public boolean hasNext() { - return pos + 2*ENTRY_SIZE - 1 < size; - } - - @Override - public Record next() { - pos+=ENTRY_SIZE; - - return new Record(underlyingArray[pos], underlyingArray[pos+1]); + private static long[] termIds(String[] keywords) { + long[] termIds = new long[keywords.length]; + for (int i = 0; i < keywords.length; i++) { + termIds[i] = hash.hashKeyword(keywords[i]); } + return termIds; } - - public record Record(long wordId, long metadata) {} } diff --git 
a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java new file mode 100644 index 00000000..c9de3da1 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryTermData.java @@ -0,0 +1,20 @@ +package nu.marginalia.index.journal.model; + +import nu.marginalia.sequence.GammaCodedSequence; + +/** Data corresponding to a term in a document in the index journal. + * + * @param termId the id of the term + * @param metadata the metadata of the term + * @param positions the positions of the word in the document, gamma coded + * + * @see GammaCodedSequence + */ +public record IndexJournalEntryTermData( + long termId, + long metadata, + GammaCodedSequence positions) +{ + + +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java index 625267d1..0f3a6ff2 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java @@ -1,35 +1,29 @@ package nu.marginalia.index.journal.reader; -import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.model.IndexJournalEntryTermData; import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.sequence.GammaCodedSequence; import java.io.DataInputStream; import java.io.IOException; import java.nio.ByteBuffer; -import java.nio.LongBuffer; +import java.util.Iterator; -public class IndexJournalReadEntry { +public class IndexJournalReadEntry implements Iterable { public final IndexJournalEntryHeader header; - private final long[] buffer; + private final ByteBuffer buffer; + private 
final int initialPos; - public IndexJournalReadEntry(IndexJournalEntryHeader header, long[] buffer) { + public IndexJournalReadEntry(IndexJournalEntryHeader header, ByteBuffer buffer) { this.header = header; this.buffer = buffer; + this.initialPos = buffer.position(); } - record WorkArea(byte[] bytes, LongBuffer buffer) { - WorkArea(byte[] bytes) { - this(bytes, ByteBuffer.wrap(bytes).asLongBuffer()); - } - WorkArea() { - this(new byte[8*65536]); - } - } - - static ThreadLocal pool = ThreadLocal.withInitial(WorkArea::new); + static ThreadLocal pool = ThreadLocal.withInitial(() -> ByteBuffer.allocate(8*65536)); public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException { @@ -44,13 +38,11 @@ public class IndexJournalReadEntry { meta); var workArea = pool.get(); - inputStream.readFully(workArea.bytes, 0, 8 * header.entrySize()); - - long[] out = new long[header.entrySize()]; - workArea.buffer.get(0, out, 0, out.length); - - return new IndexJournalReadEntry(header, out); + inputStream.readFully(workArea.array(), 0, header.entrySize()); + workArea.position(0); + workArea.limit(header.entrySize()); + return new IndexJournalReadEntry(header, workArea); } public long docId() { @@ -61,12 +53,54 @@ public class IndexJournalReadEntry { return header.documentMeta(); } + public int documentFeatures() { + return header.documentFeatures(); + } + public int domainId() { return UrlIdCodec.getDomainId(docId()); } - public IndexJournalEntryData readEntry() { - return new IndexJournalEntryData(header.entrySize(), buffer); + public void reset() { + buffer.position(initialPos); + } + + public Iterator iterator() { + return new TermDataIterator(buffer, initialPos); } } + +class TermDataIterator implements Iterator { + private final ByteBuffer buffer; + + TermDataIterator(ByteBuffer buffer, int initialPos) { + this.buffer = buffer; + this.buffer.position(initialPos); + } + + @Override + public boolean hasNext() { + return buffer.position() < buffer.limit(); + 
} + + @Override + public IndexJournalEntryTermData next() { + // read the metadata for the term + long termId = buffer.getLong(); + long meta = buffer.getLong(); + + // read the size of the sequence data + int size = buffer.get() & 0xFF; + + // slice the buffer to get the sequence data + var slice = buffer.slice(buffer.position(), size); + var sequence = new GammaCodedSequence(slice); + + // advance the buffer position to the next term + buffer.position(buffer.position() + size); + + return new IndexJournalEntryTermData(termId, meta, sequence); + } + +} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java index 14e686b3..2f57da61 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java @@ -12,6 +12,9 @@ public interface IndexJournalReader { int FILE_HEADER_SIZE_LONGS = 2; int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; + int DOCUMENT_HEADER_SIZE_BYTES = 24; + int TERM_HEADER_SIZE_BYTES = 17; + /** Create a reader for a single file. 
*/ static IndexJournalReader singleFile(Path fileName) throws IOException { return new IndexJournalReaderSingleFile(fileName); @@ -25,22 +28,23 @@ public interface IndexJournalReader { default void forEachWordId(LongConsumer consumer) { var ptr = this.newPointer(); while (ptr.nextDocument()) { - while (ptr.nextRecord()) { - consumer.accept(ptr.wordId()); + for (var termData : ptr) { + consumer.accept(termData.termId()); } } } - default void forEachDocId(LongConsumer consumer) { - var ptr = this.newPointer(); - while (ptr.nextDocument()) { - consumer.accept(ptr.documentId()); + default void forEachDocId(LongConsumer consumer) throws IOException { + try (var ptr = this.newPointer()) { + while (ptr.nextDocument()) { + consumer.accept(ptr.documentId()); + } } } /** Create a new pointer to the journal. The IndexJournalPointer is * a two-tiered iterator that allows both iteration over document records - * and their keywords + * and the terms within each document. */ IndexJournalPointer newPointer(); diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java index d5ba23b8..8a4361fa 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java @@ -16,12 +16,15 @@ public class IndexJournalReaderPagingImpl implements IndexJournalReader { private final List readers; public IndexJournalReaderPagingImpl(Path baseDir) throws IOException { - var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir); - if (inputFiles.isEmpty()) + this(IndexJournalFileNames.findJournalFiles(baseDir)); + + if (readers.isEmpty()) logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir); else - logger.info("Creating paging index journal reader for {} inputs", 
inputFiles.size()); + logger.info("Creating paging index journal reader for {} inputs", readers.size()); + } + public IndexJournalReaderPagingImpl(List inputFiles) throws IOException { this.readers = new ArrayList<>(inputFiles.size()); for (var inputFile : inputFiles) { diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java index a131a788..488d0dc6 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java @@ -2,18 +2,20 @@ package nu.marginalia.index.journal.reader; import com.github.luben.zstd.ZstdInputStream; import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryTermData; import nu.marginalia.index.journal.model.IndexJournalFileHeader; import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; +import org.jetbrains.annotations.NotNull; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import java.util.Iterator; public class IndexJournalReaderSingleFile implements IndexJournalReader { - private Path journalFile; + private final Path journalFile; public final IndexJournalFileHeader fileHeader; @Override @@ -58,8 +60,6 @@ class SingleFileJournalPointer implements IndexJournalPointer { private final IndexJournalFileHeader fileHeader; private final DataInputStream dataInputStream; private IndexJournalReadEntry entry; - private IndexJournalEntryData entryData; - private int recordIdx = -2; private int docIdx = -1; public SingleFileJournalPointer( @@ -73,9 +73,6 @@ class SingleFileJournalPointer implements IndexJournalPointer { @SneakyThrows @Override public boolean nextDocument() { - 
recordIdx = -2; - entryData = null; - if (++docIdx < fileHeader.fileSizeRecords()) { entry = IndexJournalReadEntry.read(dataInputStream); return true; @@ -86,19 +83,6 @@ class SingleFileJournalPointer implements IndexJournalPointer { return false; } - @Override - public boolean nextRecord() { - if (entryData == null) { - entryData = entry.readEntry(); - } - - recordIdx += 2; - if (recordIdx < entryData.size()) { - return true; - } - return false; - } - @Override public long documentId() { return entry.docId(); @@ -109,22 +93,21 @@ class SingleFileJournalPointer implements IndexJournalPointer { return entry.docMeta(); } + @Override - public long wordId() { - return entryData.get(recordIdx); + public int documentFeatures() { return entry.documentFeatures(); } + + /** Return an iterator over the terms in the current document. + * This iterator is not valid after calling nextDocument(). + */ + @NotNull + @Override + public Iterator iterator() { + return entry.iterator(); } @Override - public long wordMeta() { - return entryData.get(recordIdx + 1); - } - - @Override - public int documentFeatures() { - if (entryData == null) { - entryData = entry.readEntry(); - } - - return entry.header.documentFeatures(); + public void close() throws IOException { + dataInputStream.close(); } } \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java index 37100335..59e65e27 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java @@ -1,5 +1,10 @@ package nu.marginalia.index.journal.reader.pointer; +import nu.marginalia.index.journal.model.IndexJournalEntryTermData; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import 
java.util.Iterator; import java.util.function.LongPredicate; /** @@ -13,7 +18,7 @@ import java.util.function.LongPredicate; * nextDocument() will move the pointer from doc1 to doc2;
* nextRecord() will move the pointer from word1 to word2...
*/ -public interface IndexJournalPointer { +public interface IndexJournalPointer extends Iterable, AutoCloseable { /** * Advance to the next document in the journal, * returning true if such a document exists. @@ -22,11 +27,6 @@ public interface IndexJournalPointer { */ boolean nextDocument(); - /** - * Advance to the next record in the journal - */ - boolean nextRecord(); - /** * Get the id associated with the current document */ @@ -37,16 +37,6 @@ public interface IndexJournalPointer { */ long documentMeta(); - /** - * Get the wordId associated with the current record - */ - long wordId(); - - /** - * Get the termMeta associated with the current record - */ - long wordMeta(); - /** * Get the documentFeatures associated with the current record */ @@ -64,6 +54,8 @@ public interface IndexJournalPointer { default IndexJournalPointer filterWordMeta(LongPredicate filter) { return new FilteringJournalPointer(this, filter); } + + void close() throws IOException; } class JoiningJournalPointer implements IndexJournalPointer { @@ -86,11 +78,6 @@ class JoiningJournalPointer implements IndexJournalPointer { return false; } - @Override - public boolean nextRecord() { - return pointers[pIndex].nextRecord(); - } - @Override public long documentId() { return pointers[pIndex].documentId(); @@ -101,20 +88,28 @@ class JoiningJournalPointer implements IndexJournalPointer { return pointers[pIndex].documentMeta(); } - @Override - public long wordId() { - return pointers[pIndex].wordId(); - } - - @Override - public long wordMeta() { - return pointers[pIndex].wordMeta(); - } @Override public int documentFeatures() { return pointers[pIndex].documentFeatures(); } + + @NotNull + @Override + public Iterator iterator() { + return pointers[pIndex].iterator(); + } + + public void close() { + for (var p : pointers) { + try { + p.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + } } class FilteringJournalPointer implements IndexJournalPointer { @@ -128,14 +123,10 @@ class 
FilteringJournalPointer implements IndexJournalPointer { @Override public boolean nextDocument() { - return base.nextDocument(); - } - - @Override - public boolean nextRecord() { - while (base.nextRecord()) { - if (filter.test(wordMeta())) + while (base.nextDocument()) { + if (iterator().hasNext()) { return true; + } } return false; } @@ -150,18 +141,49 @@ class FilteringJournalPointer implements IndexJournalPointer { return base.documentMeta(); } - @Override - public long wordId() { - return base.wordId(); - } - - @Override - public long wordMeta() { - return base.wordMeta(); - } - @Override public int documentFeatures() { return base.documentFeatures(); } + + @NotNull + @Override + public Iterator iterator() { + + return new Iterator<>() { + private final Iterator baseIter = base.iterator(); + private IndexJournalEntryTermData value = null; + + @Override + public boolean hasNext() { + if (value != null) { + return true; + } + while (baseIter.hasNext()) { + value = baseIter.next(); + if (filter.test(value.metadata())) { + return true; + } + } + value = null; + return false; + } + + @Override + public IndexJournalEntryTermData next() { + if (hasNext()) { + var ret = value; + value = null; + return ret; + } else { + throw new IllegalStateException("No more elements"); + } + } + }; + } + + @Override + public void close() throws IOException { + base.close(); + } } \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java index 9d6966ef..df9b6836 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java @@ -1,8 +1,8 @@ package nu.marginalia.index.journal.writer; -import nu.marginalia.index.journal.model.IndexJournalEntry; import 
nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.model.IndexJournalEntryTermData; import java.io.IOException; @@ -12,18 +12,7 @@ import java.io.IOException; * @see IndexJournalWriterPagingImpl */ public interface IndexJournalWriter extends AutoCloseable { - /** Write an entry to the journal. - * - * @param header the header of the entry - * @param entry the data of the entry - * - * @return the number of bytes written - */ - int put(IndexJournalEntryHeader header, IndexJournalEntryData entry); - default int put(IndexJournalEntry entry) { - return put(entry.header(), entry.data()); - } - void close() throws IOException; + int put(IndexJournalEntryHeader header, IndexJournalEntryData data); } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java index 81d9de1e..919a8326 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java @@ -49,13 +49,14 @@ public class IndexJournalWriterPagingImpl implements IndexJournalWriter { @Override @SneakyThrows - public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { + public int put(IndexJournalEntryHeader header, IndexJournalEntryData data) + { if (bytesWritten >= sizeLimitBytes) { bytesWritten = 0; switchToNextWriter(); } - int writtenNow = currentWriter.put(header, entry); + int writtenNow = currentWriter.put(header, data); bytesWritten += writtenNow; return writtenNow; diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java 
index beadb30a..59999138 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java @@ -2,8 +2,9 @@ package nu.marginalia.index.journal.writer; import com.github.luben.zstd.ZstdDirectBufferCompressingStream; import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.reader.IndexJournalReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,6 +23,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ private static final int ZSTD_BUFFER_SIZE = 8192; private static final int DATA_BUFFER_SIZE = 8192; + private final MurmurHash3_128 hasher = new MurmurHash3_128(); + private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE); private final ZstdDirectBufferCompressingStream compressingStream; @@ -75,36 +78,48 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ @Override @SneakyThrows - public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { + public int put(IndexJournalEntryHeader header, + IndexJournalEntryData data) + { if (dataBuffer.capacity() - dataBuffer.position() < 3*8) { dataBuffer.flip(); compressingStream.compress(dataBuffer); dataBuffer.clear(); } - dataBuffer.putInt(entry.size()); + final long[] keywords = data.termIds(); + final long[] metadata = data.metadata(); + final var positions = data.positions(); + + int recordSize = 0; // document header size is 3 longs + for (int i = 0; i < keywords.length; i++) { + // term header size is 2 longs + recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size(); + } + + 
dataBuffer.putInt(recordSize); dataBuffer.putInt(header.documentFeatures()); dataBuffer.putLong(header.combinedId()); dataBuffer.putLong(header.documentMeta()); - for (int i = 0; i < entry.size(); ) { - int remaining = (dataBuffer.capacity() - dataBuffer.position()) / 8; - if (remaining <= 0) { + for (int i = 0; i < keywords.length; i++) { + int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size(); + + if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) { dataBuffer.flip(); compressingStream.compress(dataBuffer); dataBuffer.clear(); } - else while (remaining-- > 0 && i < entry.size()) { - dataBuffer.putLong(entry.underlyingArray[i++]); - } + dataBuffer.putLong(keywords[i]); + dataBuffer.putLong(metadata[i]); + dataBuffer.put((byte) positions[i].size()); + dataBuffer.put(positions[i].buffer()); } numEntries++; - final int bytesWritten = 8 * ( /*header = 3 longs */ 3 + entry.size()); - - return bytesWritten; + return recordSize; } public void close() throws IOException { @@ -121,7 +136,7 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ // Finalize the file by writing a header in the beginning - ByteBuffer header = ByteBuffer.allocate(16); + ByteBuffer header = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES); header.putLong(numEntries); header.putLong(0); // reserved for future use header.flip(); diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java index 47e8ac7f..67a60ed4 100644 --- a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java +++ b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java @@ -1,6 +1,5 @@ package nu.marginalia.index.journal; -import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.reader.IndexJournalReader; import 
nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; @@ -18,52 +17,52 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; public class IndexJournalTest { - Path tempFile; - IndexJournalReader reader; - - long firstDocId = UrlIdCodec.encodeId(44, 10); - long secondDocId = UrlIdCodec.encodeId(43, 15); - - @BeforeEach - public void setUp() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); - - var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile); - journalWriter.put(IndexJournalEntry.builder(44, 10, 55) - .add(1, 2) - .add(2, 3) - .add(3, 4) - .add(5, 6).build()); - - journalWriter.put(IndexJournalEntry.builder(43, 15, 10) - .add(5, 5) - .add(6, 6) - .build()); - journalWriter.close(); - - reader = new IndexJournalReaderSingleFile(tempFile); - } - @AfterEach - public void tearDown() throws IOException { - Files.delete(tempFile); - } - - @Test - public void forEachDocId() { - List expected = List.of(firstDocId, secondDocId); - List actual = new ArrayList<>(); - - reader.forEachDocId(actual::add); - assertEquals(expected, actual); - } - - @Test - public void forEachWordId() { - List expected = List.of(1, 2, 3, 5, 5 ,6); - List actual = new ArrayList<>(); - - reader.forEachWordId(i -> actual.add((int) i)); - assertEquals(expected, actual); - } +// Path tempFile; +// IndexJournalReader reader; +// +// long firstDocId = UrlIdCodec.encodeId(44, 10); +// long secondDocId = UrlIdCodec.encodeId(43, 15); +// +// @BeforeEach +// public void setUp() throws IOException { +// tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); +// +// var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile); +// journalWriter.put(IndexJournalEntry.builder(44, 10, 55) +// .add(1, 2) +// .add(2, 3) +// .add(3, 4) +// .add(5, 6).build()); +// +// journalWriter.put(IndexJournalEntry.builder(43, 15, 
10) +// .add(5, 5) +// .add(6, 6) +// .build()); +// journalWriter.close(); +// +// reader = new IndexJournalReaderSingleFile(tempFile); +// } +// @AfterEach +// public void tearDown() throws IOException { +// Files.delete(tempFile); +// } +// +// @Test +// public void forEachDocId() { +// List expected = List.of(firstDocId, secondDocId); +// List actual = new ArrayList<>(); +// +// reader.forEachDocId(actual::add); +// assertEquals(expected, actual); +// } +// +// @Test +// public void forEachWordId() { +// List expected = List.of(1, 2, 3, 5, 5 ,6); +// List actual = new ArrayList<>(); +// +// reader.forEachWordId(i -> actual.add((int) i)); +// assertEquals(expected, actual); +// } } diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java new file mode 100644 index 00000000..b9cd49c1 --- /dev/null +++ b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java @@ -0,0 +1,367 @@ +package nu.marginalia.index.journal; + +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.model.IndexJournalEntryTermData; +import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl; +import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; +import nu.marginalia.sequence.GammaCodedSequence; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.List; + +import 
static org.junit.jupiter.api.Assertions.*; + +public class IndexJournalWriterTest { + Path tempFile; + Path tempFile2; + ByteBuffer workArea = ByteBuffer.allocate(1024); + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); + tempFile2 = Files.createTempFile(getClass().getSimpleName(), ".dat"); + } + @AfterEach + public void tearDown() throws IOException { + Files.delete(tempFile); + Files.delete(tempFile2); + } + + private GammaCodedSequence gcs(int... values) { + return GammaCodedSequence.generate(workArea, values); + } + + static MurmurHash3_128 hasher = new MurmurHash3_128(); + static long wordId(String str) { + return hasher.hashKeyword(str); + } + + @Test + public void testSingleFile() { + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { + // Write two documents with two terms each + writer.put(new IndexJournalEntryHeader(11, 22, 33), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{44, 55}, + new GammaCodedSequence[]{ + gcs(1, 3, 5), + gcs(2, 4, 6), + }) + ); + writer.put(new IndexJournalEntryHeader(12, 23, 34), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{45, 56}, + new GammaCodedSequence[]{ + gcs(2, 4, 6), + gcs(3, 5, 7), + }) + ); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + // Read the journal back + + try { + var reader = new IndexJournalReaderSingleFile(tempFile); + + Iterator iter; + IndexJournalEntryTermData termData; + + try (var ptr = reader.newPointer()) { + + /** DOCUMENT 1 */ + assertTrue(ptr.nextDocument()); + assertEquals(11, ptr.documentId()); + assertEquals(22, ptr.documentFeatures()); + assertEquals(33, ptr.documentMeta()); + + iter = ptr.iterator(); + + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(44, termData.metadata()); + assertEquals(IntList.of(1, 3, 5), 
termData.positions().values()); + + // Term 2 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word2"), termData.termId()); + assertEquals(55, termData.metadata()); + assertEquals(IntList.of(2, 4, 6), termData.positions().values()); + + // No more terms + + assertFalse(iter.hasNext()); + + /** DOCUMENT 2 */ + assertTrue(ptr.nextDocument()); + assertEquals(12, ptr.documentId()); + assertEquals(23, ptr.documentFeatures()); + assertEquals(34, ptr.documentMeta()); + + iter = ptr.iterator(); + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(45, termData.metadata()); + assertEquals(IntList.of(2, 4, 6), termData.positions().values()); + + // Term 2 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word2"), termData.termId()); + assertEquals(56, termData.metadata()); + assertEquals(IntList.of(3, 5, 7), termData.positions().values()); + + // No more terms + assertFalse(iter.hasNext()); + + // No more documents + assertFalse(ptr.nextDocument()); + } + } + catch (IOException ex) { + Assertions.fail(ex); + } + } + + @Test + public void testMultiFile() { + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { + writer.put(new IndexJournalEntryHeader(11, 22, 33), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{44, 55}, + new GammaCodedSequence[]{ + gcs(1, 3, 5), + gcs(2, 4, 6), + }) + ); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) { + writer.put(new IndexJournalEntryHeader(12, 23, 34), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{45, 56}, + new GammaCodedSequence[]{ + gcs(2, 4, 6), + gcs(3, 5, 7), + }) + ); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + // Read the journal back + + try { + var reader = new IndexJournalReaderPagingImpl(List.of(tempFile, tempFile2)); + 
+ Iterator iter; + IndexJournalEntryTermData termData; + + try (var ptr = reader.newPointer()) { + + /** DOCUMENT 1 */ + assertTrue(ptr.nextDocument()); + assertEquals(11, ptr.documentId()); + assertEquals(22, ptr.documentFeatures()); + assertEquals(33, ptr.documentMeta()); + + iter = ptr.iterator(); + + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(44, termData.metadata()); + assertEquals(IntList.of(1, 3, 5), termData.positions().values()); + + // Term 2 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word2"), termData.termId()); + assertEquals(55, termData.metadata()); + assertEquals(IntList.of(2, 4, 6), termData.positions().values()); + + // No more terms + + assertFalse(iter.hasNext()); + + /** DOCUMENT 2 */ + assertTrue(ptr.nextDocument()); + assertEquals(12, ptr.documentId()); + assertEquals(23, ptr.documentFeatures()); + assertEquals(34, ptr.documentMeta()); + + iter = ptr.iterator(); + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(45, termData.metadata()); + assertEquals(IntList.of(2, 4, 6), termData.positions().values()); + + // Term 2 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word2"), termData.termId()); + assertEquals(56, termData.metadata()); + assertEquals(IntList.of(3, 5, 7), termData.positions().values()); + + // No more terms + assertFalse(iter.hasNext()); + + // No more documents + assertFalse(ptr.nextDocument()); + } + } + catch (IOException ex) { + Assertions.fail(ex); + } + } + + @Test + public void testSingleFileIterTwice() { + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { + // Write two documents with two terms each + writer.put(new IndexJournalEntryHeader(11, 22, 33), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{44, 55}, + new GammaCodedSequence[]{ + 
gcs(1, 3, 5), + gcs(2, 4, 6), + }) + ); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + // Read the journal back + + try { + var reader = new IndexJournalReaderSingleFile(tempFile); + + Iterator iter; + IndexJournalEntryTermData termData; + + try (var ptr = reader.newPointer()) { + + /** DOCUMENT 1 */ + assertTrue(ptr.nextDocument()); + assertEquals(11, ptr.documentId()); + assertEquals(22, ptr.documentFeatures()); + assertEquals(33, ptr.documentMeta()); + + iter = ptr.iterator(); + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(44, termData.metadata()); + assertEquals(IntList.of(1, 3, 5), termData.positions().values()); + + // Ensure we can iterate again over the same document without persisting state or closing the pointer + + iter = ptr.iterator(); + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(44, termData.metadata()); + assertEquals(IntList.of(1, 3, 5), termData.positions().values()); + } + } + catch (IOException ex) { + Assertions.fail(ex); + } + } + + @Test + public void testFiltered() { + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { + // Write two documents with two terms each + writer.put(new IndexJournalEntryHeader(11, 22, 33), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{44, 55}, + new GammaCodedSequence[]{ + gcs(1, 3, 5), + gcs(2, 4, 6), + }) + ); + writer.put(new IndexJournalEntryHeader(12, 23, 34), + new IndexJournalEntryData( + new String[]{"word1", "word2"}, + new long[]{45, 56}, + new GammaCodedSequence[]{ + gcs(2, 4, 6), + gcs(3, 5, 7), + } + )); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + // Read the journal back + + try { + var reader = new IndexJournalReaderSingleFile(tempFile).filtering(meta -> meta == 45); + + Iterator iter; + IndexJournalEntryTermData termData; + + try (var ptr = 
reader.newPointer()) { + /** DOCUMENT 2 */ + assertTrue(ptr.nextDocument()); + assertEquals(12, ptr.documentId()); + assertEquals(23, ptr.documentFeatures()); + assertEquals(34, ptr.documentMeta()); + + iter = ptr.iterator(); + // Term 1 + assertTrue(iter.hasNext()); + termData = iter.next(); + assertEquals(wordId("word1"), termData.termId()); + assertEquals(45, termData.metadata()); + assertEquals(IntList.of(2, 4, 6), termData.positions().values()); + + // No more terms + assertFalse(iter.hasNext()); + // No more documents + assertFalse(ptr.nextDocument()); + } + } + catch (IOException ex) { + Assertions.fail(ex); + } + } + +} diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java index 202a229c..fe468a87 100644 --- a/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java +++ b/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java @@ -9,125 +9,125 @@ import java.util.ArrayList; import static org.junit.jupiter.api.Assertions.assertEquals; class IndexJournalPointerTest { - - @Test - public void concatenate() { - MockPointer left = new MockPointer( - List.of(new MockDocument(1, 2, 3, List.of( - new MockRecord(4, 5), - new MockRecord(6, 7)) - )) - ); - - MockPointer right = new MockPointer( - List.of(new MockDocument(8, 9, 10, List.of( - new MockRecord(11, 12), - new MockRecord(13, 14)) - )) - ); - - IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right); - List docIdsSeq = new ArrayList<>(); - List wordIdsSeq = new ArrayList<>(); - while (concatenated.nextDocument()) { - docIdsSeq.add(concatenated.documentId()); - while (concatenated.nextRecord()) { - wordIdsSeq.add(concatenated.wordId()); - } - } - - assertEquals(docIdsSeq, List.of(1L, 8L)); - assertEquals(wordIdsSeq, List.of(4L, 6L, 
11L, 13L)); - } - - @Test - public void filter() { - MockPointer left = new MockPointer( - List.of(new MockDocument(1, 2, 3, List.of( - new MockRecord(1, 1), - new MockRecord(2, 2), - new MockRecord(3, 3), - new MockRecord(4, 4), - new MockRecord(5, 5) - ) - ), new MockDocument(2, 2, 3, List.of( - new MockRecord(1, 1), - new MockRecord(3, 3), - new MockRecord(5, 5) - ) - )) - - ); - var filtered = left.filterWordMeta(meta -> (meta % 2) == 0); - - List docIdsSeq = new ArrayList<>(); - List wordIdsSeq = new ArrayList<>(); - while (filtered.nextDocument()) { - docIdsSeq.add(filtered.documentId()); - while (filtered.nextRecord()) { - wordIdsSeq.add(filtered.wordId()); - } - } - - assertEquals(docIdsSeq, List.of(1L, 2L)); - assertEquals(wordIdsSeq, List.of(2L, 4L)); - } - - class MockPointer implements IndexJournalPointer { - private final List documents; - - int di = -1; - int ri; - - public MockPointer(Collection documents) { - this.documents = new ArrayList<>(documents); - } - - @Override - public boolean nextDocument() { - if (++di < documents.size()) { - ri = -1; - return true; - } - - return false; - } - - @Override - public boolean nextRecord() { - if (++ri < documents.get(di).records.size()) { - return true; - } - - return false; - } - - @Override - public long documentId() { - return documents.get(di).docId; - } - - @Override - public long documentMeta() { - return documents.get(di).docMeta; - } - - @Override - public long wordId() { - return documents.get(di).records.get(ri).wordId; - } - - @Override - public long wordMeta() { - return documents.get(di).records.get(ri).wordMeta; - } - - @Override - public int documentFeatures() { - return documents.get(di).docFeatures; - } - } - - record MockDocument(long docId, long docMeta, int docFeatures, List records) {} - record MockRecord(long wordId, long wordMeta) {} +// +// @Test +// public void concatenate() { +// MockPointer left = new MockPointer( +// List.of(new MockDocument(1, 2, 3, List.of( +// new 
MockRecord(4, 5), +// new MockRecord(6, 7)) +// )) +// ); +// +// MockPointer right = new MockPointer( +// List.of(new MockDocument(8, 9, 10, List.of( +// new MockRecord(11, 12), +// new MockRecord(13, 14)) +// )) +// ); +// +// IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right); +// List docIdsSeq = new ArrayList<>(); +// List wordIdsSeq = new ArrayList<>(); +// while (concatenated.nextDocument()) { +// docIdsSeq.add(concatenated.documentId()); +// while (concatenated.nextRecord()) { +// wordIdsSeq.add(concatenated.termId()); +// } +// } +// +// assertEquals(docIdsSeq, List.of(1L, 8L)); +// assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L)); +// } +// +// @Test +// public void filter() { +// MockPointer left = new MockPointer( +// List.of(new MockDocument(1, 2, 3, List.of( +// new MockRecord(1, 1), +// new MockRecord(2, 2), +// new MockRecord(3, 3), +// new MockRecord(4, 4), +// new MockRecord(5, 5) +// ) +// ), new MockDocument(2, 2, 3, List.of( +// new MockRecord(1, 1), +// new MockRecord(3, 3), +// new MockRecord(5, 5) +// ) +// )) +// +// ); +// var filtered = left.filterWordMeta(meta -> (meta % 2) == 0); +// +// List docIdsSeq = new ArrayList<>(); +// List wordIdsSeq = new ArrayList<>(); +// while (filtered.nextDocument()) { +// docIdsSeq.add(filtered.documentId()); +// while (filtered.nextRecord()) { +// wordIdsSeq.add(filtered.termId()); +// } +// } +// +// assertEquals(docIdsSeq, List.of(1L, 2L)); +// assertEquals(wordIdsSeq, List.of(2L, 4L)); +// } +// +// class MockPointer implements IndexJournalPointer { +// private final List documents; +// +// int di = -1; +// int ri; +// +// public MockPointer(Collection documents) { +// this.documents = new ArrayList<>(documents); +// } +// +// @Override +// public boolean nextDocument() { +// if (++di < documents.size()) { +// ri = -1; +// return true; +// } +// +// return false; +// } +// +// @Override +// public boolean nextRecord() { +// if (++ri < documents.get(di).records.size()) 
{ +// return true; +// } +// +// return false; +// } +// +// @Override +// public long documentId() { +// return documents.get(di).docId; +// } +// +// @Override +// public long documentMeta() { +// return documents.get(di).docMeta; +// } +// +// @Override +// public long termId() { +// return documents.get(di).records.get(ri).termId; +// } +// +// @Override +// public long wordMeta() { +// return documents.get(di).records.get(ri).wordMeta; +// } +// +// @Override +// public int documentFeatures() { +// return documents.get(di).docFeatures; +// } +// } +// +// record MockDocument(long docId, long docMeta, int docFeatures, List records) {} +// record MockRecord(long termId, long wordMeta) {} } \ No newline at end of file diff --git a/code/index/index-reverse/build.gradle b/code/index/index-reverse/build.gradle index bd46b3a0..1ba91c19 100644 --- a/code/index/index-reverse/build.gradle +++ b/code/index/index-reverse/build.gradle @@ -16,12 +16,16 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:random-write-funnel') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') implementation project(':code:common:process') + implementation project(':third-party:parquet-floor') + implementation project(':third-party:commons-codec') + implementation libs.bundles.slf4j diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java new file mode 100644 index 00000000..180976e1 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java @@ -0,0 +1,51 @@ +package nu.marginalia.index.construction; 
+ +import nu.marginalia.sequence.GammaCodedSequence; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class PositionsFileConstructor implements AutoCloseable { + private final Path file; + private final FileChannel channel; + + private long offset; + private final ByteBuffer workBuffer = ByteBuffer.allocate(8192); + + public PositionsFileConstructor(Path file) throws IOException { + this.file = file; + + channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + } + + /** Add a term to the positions file + * @param termMeta the term metadata + * @param positions the positions of the term + * @return the offset of the term in the file + */ + public long add(byte termMeta, GammaCodedSequence positions) throws IOException { + synchronized (file) { + var positionBuffer = positions.buffer(); + int size = 1 + positionBuffer.remaining(); + + if (workBuffer.remaining() < size) { + workBuffer.flip(); + channel.write(workBuffer); + workBuffer.clear(); + } + workBuffer.put(termMeta); + workBuffer.put(positionBuffer); + + offset += size; + return offset; + } + } + + public void close() throws IOException { + channel.force(false); + channel.close(); + } +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java index 7a925679..d7227758 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java @@ -7,6 +7,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.atomic.AtomicInteger; @@ -48,18 +49,22 @@ public class 
ReverseIndexConstructor { return; } + Path positionsFile = tmpDir.resolve("positions.dat"); + Files.deleteIfExists(positionsFile); try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName)) { heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); - try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) { + try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes"); + PositionsFileConstructor posConstructor = new PositionsFileConstructor(positionsFile); + ) { AtomicInteger progress = new AtomicInteger(0); inputs .parallelStream() .map(in -> { preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); - return construct(in); + return construct(in, posConstructor); }) .reduce(this::merge) .ifPresent((index) -> { @@ -73,9 +78,9 @@ public class ReverseIndexConstructor { } @SneakyThrows - private ReversePreindexReference construct(Path input) { + private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) { return ReversePreindex - .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir) + .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir) .closeToReference(); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java index ac39e817..3abe8171 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java @@ -40,6 +40,7 @@ public class ReversePreindex { * will have randomly assigned names. 
*/ public static ReversePreindex constructPreindex(IndexJournalReader reader, + PositionsFileConstructor positionsFileConstructor, DocIdRewriter docIdRewriter, Path workDir) throws IOException { @@ -48,7 +49,7 @@ public class ReversePreindex { Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); - var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments); + var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments); return new ReversePreindex(segments, docs); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java index 0f232577..aa4fc98e 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java @@ -21,6 +21,7 @@ import java.util.concurrent.TimeUnit; * the associated ReversePreindexWordSegments data */ public class ReversePreindexDocuments { + private static PositionsFileConstructor positionsFileConstructor; final Path file; public final LongArray documents; private static final int RECORD_SIZE_LONGS = 2; @@ -36,7 +37,9 @@ public class ReversePreindexDocuments { Path workDir, IndexJournalReader reader, DocIdRewriter docIdRewriter, + PositionsFileConstructor positionsFileConstructor, ReversePreindexWordSegments segments) throws IOException { + ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor; createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); @@ -75,14 +78,14 @@ public class ReversePreindexDocuments { var pointer = reader.newPointer(); while (pointer.nextDocument()) { long rankEncodedId = 
docIdRewriter.rewriteDocId(pointer.documentId()); - while (pointer.nextRecord()) { - long wordId = pointer.wordId(); - long wordMeta = pointer.wordMeta(); + for (var termData : pointer) { + long termId = termData.termId(); - long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); + long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); + long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions()); assembly.put(offset + 0, rankEncodedId); - assembly.put(offset + 1, wordMeta); + assembly.put(offset + 1, posOffset); } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java index 0e6c32fb..0351ed45 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java @@ -12,7 +12,7 @@ import java.nio.file.Files; import java.nio.file.Path; /** A pair of file-backed arrays of sorted wordIds - * and the count of documents associated with each wordId. + * and the count of documents associated with each termId. */ public class ReversePreindexWordSegments { public final LongArray wordIds; @@ -34,7 +34,7 @@ public class ReversePreindexWordSegments { this.countsFile = countsFile; } - /** Returns a long-long hash map where each key is a wordId, + /** Returns a long-long hash map where each key is a termId, * and each value is the start offset of the data. 
*/ public Long2LongOpenHashMap asMap(int recordSize) { @@ -188,7 +188,7 @@ public class ReversePreindexWordSegments { if (i == fileSize) { // We've reached the end of the iteration and there is no - // "next" wordId to fetch + // "next" termId to fetch wordId = Long.MIN_VALUE; return false; } diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java index 265864c4..981136ad 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java @@ -2,12 +2,14 @@ package nu.marginalia.index; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.ReversePreindex; import nu.marginalia.index.construction.TestJournalFactory; import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import java.io.IOException; import java.nio.file.Files; @@ -89,7 +91,9 @@ class ReverseIndexReaderTest { private ReverseIndexReader createIndex(EntryDataWithWordMeta... 
scenario) throws IOException { var reader = journalFactory.createReader(scenario); - var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir); + var preindex = ReversePreindex.constructPreindex(reader, + Mockito.mock(PositionsFileConstructor.class), + DocIdRewriter.identity(), tempDir); Path docsFile = tempDir.resolve("docs.dat"); diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java index d6d81818..ca3b49a3 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java @@ -100,6 +100,7 @@ class ReversePreindexDocsTest { assertEquals(expected, actual); } + @Test public void testDocs2() throws IOException { var reader = journalFactory.createReader( @@ -108,7 +109,7 @@ ); var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); + var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); List expected = List.of( new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }), diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java index b122921b..db262d9f 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java @@ -5,6 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.sequence.GammaCodedSequence; import java.io.IOException; import java.nio.file.Files; @@ -60,12 +61,18 @@ public class TestJournalFactory { var writer = new IndexJournalWriterSingleFileImpl(jf); for (var entry : entries) { - long[] data = new long[entry.wordIds.length * 2]; - for (int i = 0; i < entry.wordIds.length; i++) - data[i*2] = entry.wordIds[i]; + long[] termIds = new long[entry.wordIds.length]; + long[] meta = new long[entry.wordIds.length]; + + GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; + for (int i = 0; i < entry.wordIds.length; i++) { + termIds[i] = entry.wordIds[i]; + meta[i] = 0; + positions[i] = new GammaCodedSequence(new byte[1]); + } writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), - new IndexJournalEntryData(data)); + new IndexJournalEntryData(termIds, meta, positions)); } writer.close(); var ret = new IndexJournalReaderSingleFile(jf); @@ -77,14 +84,18 @@ public class TestJournalFactory { var writer = new IndexJournalWriterSingleFileImpl(jf); for (var entry : entries) { - long[] data = new long[entry.wordIds.length * 2]; + + long[] termIds = new long[entry.wordIds.length]; + long[] meta = new long[entry.wordIds.length]; + GammaCodedSequence[] positions = new GammaCodedSequence[entry.wordIds.length]; for (int i = 0; i < entry.wordIds.length; i++) { - data[i * 2] = entry.wordIds[i].wordId; - data[i * 2 + 1] = entry.wordIds[i].meta; + termIds[i] = entry.wordIds[i].wordId; + meta[i] = entry.wordIds[i].meta; + positions[i] = new GammaCodedSequence(new byte[1]); } writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), - new IndexJournalEntryData(data)); + new IndexJournalEntryData(termIds, meta, positions)); } writer.close(); var ret = new IndexJournalReaderSingleFile(jf); diff --git 
a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 7b0a6a24..4a976265 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -8,15 +8,16 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.storage.FileStorageService; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.writer.IndexJournalWriter; @@ -41,6 +42,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; @@ -300,7 +302,18 @@ public class IndexQueryServiceIntegrationSmokeTest { "test", "test", 0., "HTML5", 0, null, 0, 10 )); - indexJournalWriter.put(header, new IndexJournalEntryData(data)); + String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new); + 
long[] metadata = new long[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); + } + GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; + ByteBuffer wa = ByteBuffer.allocate(16); + for (int i = 0; i < factors.length; i++) { + positions[i] = GammaCodedSequence.generate(wa, i + 1); + } + + indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); } @SneakyThrows @@ -309,19 +322,24 @@ public class IndexQueryServiceIntegrationSmokeTest { long fullId = UrlIdCodec.encodeId(domain, id); var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue()); - long[] data = new long[factors.length*2]; - for (int i = 0; i < factors.length; i++) { - data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); - data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } - ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); - indexJournalWriter.put(header, new IndexJournalEntryData(data)); + String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new); + long[] metadata = new long[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); + } + GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; + ByteBuffer wa = ByteBuffer.allocate(16); + for (int i = 0; i < factors.length; i++) { + positions[i] = GammaCodedSequence.generate(wa, i); + } + + indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index e29f8751..861923dd 100644 --- 
a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -7,13 +7,14 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.storage.FileStorageService; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.writer.IndexJournalWriter; @@ -44,6 +45,7 @@ import org.junit.jupiter.api.parallel.Execution; import javax.annotation.CheckReturnValue; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; @@ -549,13 +551,13 @@ public class IndexQueryServiceIntegrationTest { meta.documentMetadata.encode() ); - long[] dataArray = new long[words.size() * 2]; - for (int i = 0; i < words.size(); i++) { - dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword); - dataArray[2*i+1] = words.get(i).termMetadata; - } - var entry = new IndexJournalEntryData(dataArray); - indexJournalWriter.put(header, entry); + String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new); + long[] metadata = words.stream().map(w -> 
w.termMetadata).mapToLong(Long::longValue).toArray(); + GammaCodedSequence[] positions = new GammaCodedSequence[words.size()]; // FIXME: positions? + Arrays.setAll(positions, i -> new GammaCodedSequence(ByteBuffer.allocate(1))); + + indexJournalWriter.put(header, + new IndexJournalEntryData(keywords, metadata, positions)); }); var linkdbWriter = new DocumentDbWriter( diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java index 7ee85495..335d57d8 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java @@ -7,18 +7,30 @@ import nu.marginalia.sequence.io.BitWriter; import java.nio.ByteBuffer; -/** Implement coding and decoding of sequences of integers using the Elias Gamma code - * - * https://en.wikipedia.org/wiki/Elias_gamma_coding +/** Implement coding and decoding of sequences of integers using the Elias Gamma code. + * The sequence is prefixed by the number of integers in the sequence, then the delta between + * each integer in the sequence is encoded using the Elias Gamma code. + *

+ * https://en.wikipedia.org/wiki/Elias_gamma_coding * */ public class EliasGammaCodec implements IntIterator { private final BitReader reader; + int rem = 0; private int last = 0; private int next = 0; private EliasGammaCodec(ByteBuffer buffer) { reader = new BitReader(buffer); + + int bits = reader.takeWhileZero(); + + if (!reader.hasMore()) { + rem = 0; + } + else { + rem = reader.get(bits); + } } /** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */ @@ -31,7 +43,13 @@ public class EliasGammaCodec implements IntIterator { * or equal to zero. */ public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) { + if (sequence.isEmpty()) + return ByteBuffer.allocate(0); + var writer = new BitWriter(workArea); + + writer.putGammaCoded(sequence.size()); + int last = 0; for (var iter = sequence.iterator(); iter.hasNext(); ) { @@ -42,9 +60,7 @@ public class EliasGammaCodec implements IntIterator { // can't encode zeroes assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values"; - int bits = Integer.numberOfTrailingZeros(Integer.highestOneBit(delta)); - writer.put(0, bits + 1); - writer.put(delta, bits + 1); + writer.putGammaCoded(delta); } return writer.finish(); @@ -60,16 +76,13 @@ public class EliasGammaCodec implements IntIterator { @Override public boolean hasNext() { - if (next > 0) - return true; - if (!reader.hasMore()) - return false; + if (next > 0) return true; + if (!reader.hasMore() || --rem < 0) return false; int bits = reader.takeWhileZero(); - if (!reader.hasMore()) { - return false; - } + if (!reader.hasMore()) return false; + int delta = reader.get(bits); last += delta; next = last; diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java index 2207921d..58ff30d2 100644 --- 
a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java @@ -16,6 +16,8 @@ import java.util.StringJoiner; * */ public class GammaCodedSequence implements BinarySerializable, Iterable { private final ByteBuffer raw; + int startPos = 0; + int startLimit = 0; /** Create a new GammaCodedSequence from a sequence of integers. * @@ -37,12 +39,16 @@ public class GammaCodedSequence implements BinarySerializable, Iterable public GammaCodedSequence(ByteBuffer bytes) { this.raw = bytes; + startPos = bytes.position(); + startLimit = bytes.limit(); } public GammaCodedSequence(byte[] bytes) { raw = ByteBuffer.allocate(bytes.length); raw.put(bytes); raw.clear(); + startPos = 0; + startLimit = bytes.length; } /** Return the raw bytes of the sequence. */ @@ -52,21 +58,29 @@ public class GammaCodedSequence implements BinarySerializable, Iterable return raw.array(); } else { - raw.clear(); - byte[] bytes = new byte[raw.capacity()]; - raw.get(bytes, 0, bytes.length); + raw.get(0, bytes, 0, bytes.length); return bytes; } } @Override public IntIterator iterator() { - raw.clear(); + raw.position(startPos); + raw.limit(startLimit); return EliasGammaCodec.decode(raw); } + public IntList values() { + var intItr = iterator(); + IntArrayList ret = new IntArrayList(8); + while (intItr.hasNext()) { + ret.add(intItr.nextInt()); + } + return ret; + } + /** Decode the sequence into an IntList; * this is a somewhat slow operation, * iterating over the data directly more performant */ @@ -94,4 +108,15 @@ public class GammaCodedSequence implements BinarySerializable, Iterable } return sj.toString(); } + + public ByteBuffer buffer() { + raw.position(startPos); + raw.limit(startLimit); + + return raw; + } + + public int size() { + return raw.capacity(); + } } diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java 
b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java index 2d7d79db..08979f0d 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java @@ -78,7 +78,7 @@ public class BitReader { int result = 0; - for (;;) { + do { // Ensure we have bits to read if (bitPosition <= 0) { if (underlying.hasRemaining()) @@ -96,10 +96,8 @@ public class BitReader { // Subtract the number of bits read from the current position bitPosition -= zeroes; - // If bitPosition isn't zero, we've found a 1 and can stop - if (bitPosition > 0) - break; - } + // If bit position is positive, we've found a 1 and can stop + } while (bitPosition <= 0); return result; } diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java index 92f6abc6..e5636064 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java @@ -72,6 +72,17 @@ public class BitWriter { } } + /** Write the provided value in a gamma-coded format, + * e.g.
by first finding the number of significant bits, + * then writing that many zeroes, then the bits themselves + */ + public void putGammaCoded(int value) { + int bits = 1 + Integer.numberOfTrailingZeros(Integer.highestOneBit(value)); + + put(0, bits); + put(value, bits); + } + public ByteBuffer finish() { finishLastByte(); diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java index 579653a2..0c6e0e8b 100644 --- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java @@ -115,16 +115,17 @@ class BitReaderTest { } @Test - public void testTakeWhileZeroOverInt32() { + public void testTakeWhileZeroOverInt64() { var writer = new BitWriter(ByteBuffer.allocate(1024)); writer.put(0, 32); + writer.put(0, 32); writer.put(0, 2); writer.putBit(true); var buffer = writer.finish(); var reader = new BitReader(buffer); int val = reader.takeWhileZero(); - assertEquals(34, val); + assertEquals(66, val); assertTrue(reader.getBit()); } } \ No newline at end of file diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java index 2dee50fa..9c87bab7 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java @@ -4,9 +4,9 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; +import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.storage.FileStorageService; import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.index.journal.model.IndexJournalEntryData; import 
nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; @@ -18,9 +18,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; -import java.sql.SQLException; -import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH; @Singleton public class LoaderIndexJournalWriter { @@ -28,12 +26,11 @@ public class LoaderIndexJournalWriter { private final IndexJournalWriter indexWriter; private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); - private final MurmurHash3_128 hasher = new MurmurHash3_128(); - private final long[] buffer = new long[MAX_LENGTH * 2]; + private final long[] buffer = new long[65536]; @Inject - public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException { + public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException { var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService); var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea); @@ -68,26 +65,10 @@ public class LoaderIndexJournalWriter { return; } - var pointer = wordSet.newPointer(); - - while (pointer.hasMore()) { - int i = 0; - - while (i < buffer.length - && pointer.advancePointer()) - { - final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword()); - - buffer[i++] = hashedKeyword; - buffer[i++] = pointer.getMetadata(); - } - - var entry = new IndexJournalEntryData(i, buffer); - var header = new IndexJournalEntryHeader(combinedId, features, metadata); - - indexWriter.put(header, entry); - } + var header = new IndexJournalEntryHeader(combinedId, features, metadata); + var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions); + indexWriter.put(header, data); } public void close() throws Exception { diff --git 
a/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java b/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java deleted file mode 100644 index 0f1afebe..00000000 --- a/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java +++ /dev/null @@ -1,87 +0,0 @@ -package nu.marginalia.loading.loader; - -import nu.marginalia.sequence.GammaCodedSequence; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBase; -import nu.marginalia.storage.model.FileStorageBaseType; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.loading.LoaderIndexJournalWriter; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.LongStream; - -import static org.junit.jupiter.api.Assertions.*; - -class LoaderIndexJournalWriterTest { - - Path tempDir; - LoaderIndexJournalWriter writer; - @BeforeEach - public void setUp() throws IOException, SQLException { - tempDir = Files.createTempDirectory(getClass().getSimpleName()); - FileStorageService storageService = Mockito.mock(FileStorageService.class); - - Mockito.when(storageService.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 1,null, tempDir.toString())); - - writer = new LoaderIndexJournalWriter(storageService); - } - - @AfterEach - public void tearDown() throws Exception { - writer.close(); - List junk = 
Files.list(tempDir.resolve("iw")).toList(); - for (var item : junk) - Files.delete(item); - Files.delete(tempDir.resolve("iw")); - Files.delete(tempDir); - } - - @Test - public void testBreakup() throws Exception { - String[] keywords = new String[2000]; - long[] metadata = new long[2000]; - GammaCodedSequence[] positions = new GammaCodedSequence[2000]; - ByteBuffer workArea = ByteBuffer.allocate(1024); - for (int i = 0; i < 2000; i++) { - keywords[i] = Integer.toString(i); - metadata[i] = i+1; - positions[i] = GammaCodedSequence.generate(workArea, 1, 2, 3); - } - DocumentKeywords words = new DocumentKeywords(keywords, metadata, positions); - writer.putWords(1, 0, new DocumentMetadata(0), - words); - - writer.close(); - - List journalFiles = IndexJournalFileNames.findJournalFiles(tempDir.resolve("iw")); - assertEquals(1, journalFiles.size()); - - var reader = new IndexJournalReaderSingleFile(journalFiles.get(0)); - List docIds = new ArrayList<>(); - reader.forEachDocId(docIds::add); - assertEquals(List.of(1L, 1L), docIds); - - List metas = new ArrayList(); - var ptr = reader.newPointer(); - while (ptr.nextDocument()) { - while (ptr.nextRecord()) { - metas.add(ptr.wordMeta()); - } - } - - assertEquals(LongStream.of(metadata).boxed().toList(), metas); - } -} \ No newline at end of file diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java index fe283471..37b9893d 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java @@ -33,6 +33,7 @@ public class SearchMain extends MainClass { new ServiceDiscoveryModule(), new DatabaseModule(false) ); + // Orchestrate the boot order for the services var registry = injector.getInstance(ServiceRegistryIf.class);