+ tPos = positions.getData(tempBuffer);
+
+ for (int i = 0; i < tIds.length; i++) {
+ long termId = tIds[i];
+ byte meta = tMeta[i];
+ ByteBuffer pos = tPos.get(i);
+
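+ // addTo returns the previous value, so this doubles as a fetch-and-add
+ // claiming the next record slot for this termId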
+ long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+ long encodedPosOffset = positionsFileConstructor.add(meta, pos);
+
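+ // each record is two longs: the rank-encoded document id, followed by
+ // the size-encoded offset of the term's position data in the positions file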
+ assembly.put(offset + 0, rankEncodedId);
+ assembly.put(offset + 1, encodedPosOffset);
+ }
+ }
+
+ assembly.write(docsFile);
+ }
+ }
+
+ @SneakyThrows
+ private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
+
+ var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+ while (iter.next()) {
+ long iterStart = iter.startOffset;
+ long iterEnd = iter.endOffset;
+
+ docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+ }
+ }
+
+ public void delete() throws IOException {
+ Files.delete(this.file);
+ documents.close();
+ }
+
+ public void close() {
+ documents.close();
+ }
+
+ public void force() {
+ documents.force();
+ }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
similarity index 62%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
index 16c542d5..73bd03b2 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
@@ -1,33 +1,33 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import java.io.IOException;
import java.nio.file.Path;
-/** This is a dehydrated version of a ReversePreIndex, that only
+/** This is a dehydrated version of a FullPreIndex, that only
* keeps references to its location on disk but does not hold associated
* memory maps.
*/
-public record ReversePreindexReference(
+public record FullPreindexReference(
Path wordsFile,
Path countsFile,
Path documentsFile
)
{
- public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+ public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
this(segments.wordsFile, segments.countsFile, documents.file);
}
- public ReversePreindex open() throws IOException {
- return new ReversePreindex(
- new ReversePreindexWordSegments(
+ public FullPreindex open() throws IOException {
+ return new FullPreindex(
+ new FullPreindexWordSegments(
LongArrayFactory.mmapForModifyingShared(wordsFile),
LongArrayFactory.mmapForModifyingShared(countsFile),
wordsFile,
countsFile
),
- new ReversePreindexDocuments(
+ new FullPreindexDocuments(
LongArrayFactory.mmapForModifyingShared(documentsFile),
documentsFile
)
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
similarity index 80%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
index 0e6c32fb..0a4e39a7 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
@@ -1,30 +1,31 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.slop.SlopTable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** A pair of file-backed arrays of sorted wordIds
- * and the count of documents associated with each wordId.
+ * and the count of documents associated with each termId.
*/
-public class ReversePreindexWordSegments {
+public class FullPreindexWordSegments {
public final LongArray wordIds;
public final LongArray counts;
final Path wordsFile;
final Path countsFile;
- public ReversePreindexWordSegments(LongArray wordIds,
- LongArray counts,
- Path wordsFile,
- Path countsFile)
+ public FullPreindexWordSegments(LongArray wordIds,
+ LongArray counts,
+ Path wordsFile,
+ Path countsFile)
{
assert wordIds.size() == counts.size();
@@ -34,7 +35,7 @@ public class ReversePreindexWordSegments {
this.countsFile = countsFile;
}
- /** Returns a long-long hash map where each key is a wordId,
+ /** Returns a long-long hash map where each key is a termId,
* and each value is the start offset of the data.
*/
public Long2LongOpenHashMap asMap(int recordSize) {
@@ -51,14 +52,24 @@ public class ReversePreindexWordSegments {
return ret;
}
- public static ReversePreindexWordSegments construct(IndexJournalReader reader,
- Path wordIdsFile,
- Path countsFile)
+ public static FullPreindexWordSegments construct(IndexJournalPage instance,
+ Path wordIdsFile,
+ Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0);
- reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
+
+ try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
+ var termIds = instance.openTermIds(slopTable);
+ while (termIds.hasRemaining()) {
+ long[] tids = termIds.get();
+ for (long termId : tids) {
+ countsMap.addTo(termId, 1);
+ }
+ }
+ }
+
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
@@ -79,7 +90,7 @@ public class ReversePreindexWordSegments {
counts.set(i, countsMap.get(words.get(i)));
}
- return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
+ return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile);
}
public SegmentIterator iterator(int recordSize) {
@@ -188,7 +199,7 @@ public class ReversePreindexWordSegments {
if (i == fileSize) {
// We've reached the end of the iteration and there is no
- // "next" wordId to fetch
+ // "next" termId to fetch
wordId = Long.MIN_VALUE;
return false;
}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
new file mode 100644
index 00000000..3072ffb8
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java
@@ -0,0 +1,142 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitWriter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+
+/** Constructs the document id lists for the priority reverse index */
+public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable {
+
+ private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class);
+
+ private final FileChannel writeChannel;
+ private final FileChannel readChannel;
+
+ private final ByteBuffer readBuffer = ByteBuffer.allocate(65536).order(ByteOrder.LITTLE_ENDIAN);
+ private final ByteBuffer writeBuffer = ByteBuffer.allocate(65536);
+
+ long startL = 0;
+ long writeOffsetB = 0;
+
+ public PrioDocIdsTransformer(FileChannel writeChannel,
+ FileChannel readChannel) {
+ this.writeChannel = writeChannel;
+ this.readChannel = readChannel;
+ }
+
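+ /* Output format, as read off the writes in transform() below:
+ * a 2 bit header (0b11) and a 30 bit document count, followed by the first
+ * document id split into a 7 bit rank, a 31 bit domain id and a 26 bit ordinal;
+ * each subsequent document is a 2 bit code plus a delta against its predecessor:
+ * 0b10 = new rank (gamma coded rank delta, then full domain id and ordinal),
+ * 0b01 = new domain (delta coded domain delta, then delta coded ordinal + 1),
+ * 0b00 = new ordinal (gamma coded ordinal delta).
+ */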
+ @Override
+ public long transform(long pos, long endL) throws IOException {
+
+ final int sizeL = (int) ((endL - startL));
+ final long startOffsetB = writeOffsetB;
+
+ if (sizeL == 0)
+ throw new IllegalStateException("Empty range");
+
+ readChannel.position(startL * 8);
+ readBuffer.clear();
+
+ int toBeRead = 8 * (sizeL);
+
+ var bitWriter = new BitWriter(writeBuffer);
+
+ int prevRank = -1;
+ int prevDomainId = -1;
+ int prevDocOrd = -1;
+ boolean wroteHeader = false;
+
+ do {
+ readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead));
+ readChannel.read(readBuffer);
+ readBuffer.flip();
+
+ if (!wroteHeader) {
+ // write 11b header
+ bitWriter.putBits(3, 2);
+ // encode number of items
+ bitWriter.putBits(sizeL, 30);
+
+
+ long firstItem = readBuffer.getLong();
+
+ prevRank = UrlIdCodec.getRank(firstItem);
+ prevDomainId = UrlIdCodec.getDomainId(firstItem);
+ prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem);
+
+ bitWriter.putBits(prevRank, 7);
+ bitWriter.putBits(prevDomainId, 31);
+ bitWriter.putBits(prevDocOrd, 26);
+
+ wroteHeader = true;
+ }
+
+ while (readBuffer.hasRemaining()) {
+ if (writeBuffer.remaining() < 16) {
+ writeBuffer.flip();
+ int written = writeChannel.write(writeBuffer, writeOffsetB);
+ writeOffsetB += written;
+ writeBuffer.clear();
+ }
+
+ long nextId = readBuffer.getLong();
+
+ // break down id components
+ int rank = UrlIdCodec.getRank(nextId);
+ int domainId = UrlIdCodec.getDomainId(nextId);
+ int docOrd = UrlIdCodec.getDocumentOrdinal(nextId);
+
+ // encode components
+ if (rank != prevRank) {
+ bitWriter.putBits(0b10, 2);
+ bitWriter.putGamma(rank - prevRank);
+ bitWriter.putBits(domainId, 31);
+ bitWriter.putBits(docOrd, 26);
+ }
+ else if (domainId != prevDomainId) {
+ bitWriter.putBits(0b01, 2);
+ bitWriter.putDelta(domainId - prevDomainId);
+ bitWriter.putDelta(1 + docOrd);
+ }
+ else if (docOrd != prevDocOrd) {
+ bitWriter.putBits(0b00, 2);
+ bitWriter.putGamma(docOrd - prevDocOrd);
+ }
+ else {
+ logger.warn("Unexpected duplicate document id: {}", nextId);
+ }
+
+ prevDocOrd = docOrd;
+ prevDomainId = domainId;
+ prevRank = rank;
+
+ }
+
+ toBeRead -= readBuffer.limit();
+ readBuffer.clear();
+ } while (toBeRead > 0);
+
+ // write lingering data
+
+ // ensure any half-written data is flushed to the buffer
+ bitWriter.finishLastByte();
+
+ // update the start input pointer
+ startL = endL;
+ return startOffsetB;
+ }
+
+ @Override
+ public void close() throws IOException {
+ writeBuffer.flip();
+ int written = writeChannel.write(writeBuffer, writeOffsetB);
+ writeOffsetB += written;
+ writeBuffer.clear();
+ }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java
new file mode 100644
index 00000000..cddad7a4
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java
@@ -0,0 +1,108 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class PrioIndexConstructor {
+
+ private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class);
+
+ public enum CreateReverseIndexSteps {
+ CONSTRUCT,
+ FINALIZE,
+ FINISHED
+ }
+
+ private final Path outputFileDocs;
+ private final Path outputFileWords;
+ private final DocIdRewriter docIdRewriter;
+ private final Path tmpDir;
+
+ public PrioIndexConstructor(Path outputFileDocs,
+ Path outputFileWords,
+ DocIdRewriter docIdRewriter,
+ Path tmpDir) {
+ this.outputFileDocs = outputFileDocs;
+ this.outputFileWords = outputFileWords;
+ this.docIdRewriter = docIdRewriter;
+ this.tmpDir = tmpDir;
+ }
+
+ public void createReverseIndex(ProcessHeartbeat processHeartbeat,
+ String processName,
+ Path sourceBaseDir) throws IOException
+ {
+ var journal = IndexJournal.findJournal(sourceBaseDir);
+ if (journal.isEmpty()) {
+ logger.error("No journal files in base dir {}", sourceBaseDir);
+ return;
+ }
+
+ try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
+ var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")
+ ) {
+ heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
+
+ AtomicInteger progress = new AtomicInteger(0);
+
+ var journalVersions = journal.get().pages();
+
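+ // construct a preindex from each journal page in parallel, then
+ // pairwise merge the results into a single preindex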
+ journalVersions
+ .parallelStream()
+ .map(in -> {
+ preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size());
+ return construct(in);
+ })
+ .reduce(this::merge)
+ .ifPresent((index) -> {
+ heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
+ finalizeIndex(index);
+ heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+ });
+
+ heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+ }
+ }
+
+ @SneakyThrows
+ private PrioPreindexReference construct(IndexJournalPage journalInstance) {
+ return PrioPreindex
+ .constructPreindex(journalInstance, docIdRewriter, tmpDir)
+ .closeToReference();
+ }
+
+ @SneakyThrows
+ private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
+
+ var left = leftR.open();
+ var right = rightR.open();
+
+ try {
+ return PrioPreindex.merge(tmpDir, left, right).closeToReference();
+ }
+ finally {
+ left.delete();
+ right.delete();
+ }
+
+
+ }
+
+ @SneakyThrows
+ private void finalizeIndex(PrioPreindexReference finalPR) {
+ var finalP = finalPR.open();
+ finalP.finalizeIndex(outputFileDocs, outputFileWords);
+ finalP.delete();
+ }
+
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
new file mode 100644
index 00000000..3b971288
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
@@ -0,0 +1,298 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.journal.IndexJournalPage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static nu.marginalia.array.algo.TwoArrayOperations.countDistinctElements;
+import static nu.marginalia.array.algo.TwoArrayOperations.mergeArrays;
+
+/** Contains the data that would go into a reverse index,
+ * that is, a mapping from words to documents, minus the actual
+ * index structure that makes the data quick to access while
+ * searching.
+ *
+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
+public class PrioPreindex {
+ final PrioPreindexWordSegments segments;
+ final PrioPreindexDocuments documents;
+
+ private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
+
+ public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+ this.segments = segments;
+ this.documents = documents;
+ }
+
+ /** Constructs a new preindex with the data associated with reader. The backing files
+ * will have randomly assigned names.
+ */
+ public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage,
+ DocIdRewriter docIdRewriter,
+ Path workDir) throws IOException
+ {
+ Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
+ Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
+ Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
+
+ var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile);
+ var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments);
+ return new PrioPreindex(segments, docs);
+ }
+
+ /** Close the associated memory mapped areas and return
+ * a dehydrated version of this object that can be re-opened
+ * later.
+ */
+ public PrioPreindexReference closeToReference() {
+ try {
+ return new PrioPreindexReference(segments, documents);
+ }
+ finally {
+ segments.force();
+ documents.force();
+ segments.close();
+ documents.close();
+ }
+ }
+
+ /** Transform the preindex into a reverse index */
+ public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+ var offsets = segments.counts;
+
+ Files.deleteIfExists(outputFileDocs);
+ Files.deleteIfExists(outputFileWords);
+
+ // Estimate the size of the docs index data
+ offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(1));
+
+ // Write the docs file
+ try (var intermediateDocChannel = documents.createDocumentsFileChannel();
+ var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
+ var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
+ ) {
+ offsets.transformEachIO(0, offsets.size(), transformer);
+ }
+
+ LongArray wordIds = segments.wordIds;
+
+ if (offsets.size() != wordIds.size())
+ throw new IllegalStateException("Offsets and word-ids of different size");
+ if (offsets.size() > Integer.MAX_VALUE) {
+ throw new IllegalStateException("offsets.size() too big!");
+ }
+
+ // Estimate the size of the words index data
+ long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+ // Construct the tree
+ LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
+
+ new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+ .write(0, (int) offsets.size(), mapRegion -> {
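+ // the btree's data region holds interleaved (wordId, docs file offset) pairs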
+ for (long i = 0; i < offsets.size(); i++) {
+ mapRegion.set(2*i, wordIds.get(i));
+ mapRegion.set(2*i + 1, offsets.get(i));
+ }
+ });
+
+ wordsArray.force();
+ wordsArray.close();
+ }
+
+ /** Delete all files associated with this pre-index */
+ public void delete() throws IOException {
+ segments.delete();
+ documents.delete();
+ }
+
+ public static PrioPreindex merge(Path destDir,
+ PrioPreindex left,
+ PrioPreindex right) throws IOException {
+
+ PrioPreindexWordSegments mergingSegment =
+ createMergedSegmentWordFile(destDir, left.segments, right.segments);
+
+ var mergingIter = mergingSegment.constructionIterator(1);
+ var leftIter = left.segments.iterator(1);
+ var rightIter = right.segments.iterator(1);
+
+ Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+ LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size());
+
+ leftIter.next();
+ rightIter.next();
+
+ while (mergingIter.canPutMore()
+ && leftIter.isPositionBeforeEnd()
+ && rightIter.isPositionBeforeEnd())
+ {
+ final long currentWord = mergingIter.wordId;
+
+ if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+ {
+ // both inputs have documents for the current word
+ mergeSegments(leftIter, rightIter,
+ left.documents, right.documents,
+ mergedDocuments, mergingIter);
+ }
+ else if (leftIter.wordId == currentWord) {
+ if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments))
+ break;
+ }
+ else if (rightIter.wordId == currentWord) {
+ if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments))
+ break;
+ }
+ else assert false : "This should never happen"; // the helvetica scenario
+ }
+
+ if (leftIter.isPositionBeforeEnd()) {
+ while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments));
+ }
+
+ if (rightIter.isPositionBeforeEnd()) {
+ while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments));
+ }
+
+
+ if (leftIter.isPositionBeforeEnd())
+ throw new IllegalStateException("Left has more to go");
+ if (rightIter.isPositionBeforeEnd())
+ throw new IllegalStateException("Right has more to go");
+ if (mergingIter.canPutMore())
+ throw new IllegalStateException("Source iters ran dry before merging iter");
+
+
+ mergingSegment.force();
+
+ // We may have overestimated the size of the merged docs file in the case there were
+ // duplicates in the data, so we need to shrink it to the actual size we wrote.
+
+ mergedDocuments = shrinkMergedDocuments(mergedDocuments,
+ docsFile, mergingSegment.totalSize());
+
+ return new PrioPreindex(
+ mergingSegment,
+ new PrioPreindexDocuments(mergedDocuments, docsFile)
+ );
+ }
+
+ /** Create a segment word file with each word from both inputs, with zero counts for all the data.
+ * This is an intermediate product in merging.
+ */
+ static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+ PrioPreindexWordSegments left,
+ PrioPreindexWordSegments right) throws IOException {
+ Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+ Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+ // We need total size to request a direct LongArray range. Seems slower, but is faster.
+ // ... see LongArray.directRangeIfPossible(long start, long end)
+ long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+ 0, left.wordIds.size(),
+ 0, right.wordIds.size());
+
+ LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
+
+ mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+ 0,
+ 0, left.wordIds.size(),
+ 0, right.wordIds.size());
+
+ LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
+
+ return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+ }
+
+ /** It's possible we overestimated the necessary size of the documents file;
+ * this method shrinks it down to the smallest necessary size.
+ */
+ private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
+
+ mergedDocuments.force();
+
+ long beforeSize = mergedDocuments.size();
+ long afterSize = sizeLongs;
+ if (beforeSize != afterSize) {
+ mergedDocuments.close();
+ try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
+ bc.truncate(sizeLongs * 8);
+ }
+
+ logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
+ mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
+ }
+
+ return mergedDocuments;
+ }
+
+ /** Merge contents of the segments indicated by leftIter and rightIter into the destination
+ * segment, and advance the construction iterator with the appropriate size.
+ */
+ private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
+ PrioPreindexWordSegments.SegmentIterator rightIter,
+ PrioPreindexDocuments left,
+ PrioPreindexDocuments right,
+ LongArray dest,
+ PrioPreindexWordSegments.SegmentConstructionIterator destIter)
+ {
+ long segSize = mergeArrays(dest,
+ left.documents,
+ right.documents,
+ destIter.startOffset,
+ leftIter.startOffset, leftIter.endOffset,
+ rightIter.startOffset, rightIter.endOffset);
+
+ destIter.putNext(segSize);
+ leftIter.next();
+ rightIter.next();
+ }
+
+ /** Copy the data from the source segment at the position and length indicated by sourceIter,
+ * into the destination segment, and advance the construction iterator.
+ */
+ private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
+ PrioPreindexDocuments srcDocuments,
+ PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
+ LongArray dest) throws IOException {
+
+ long size = sourceIter.endOffset - sourceIter.startOffset;
+ long start = mergingIter.startOffset;
+ long end = start + size;
+
+ dest.transferFrom(srcDocuments.documents,
+ sourceIter.startOffset,
+ mergingIter.startOffset,
+ end);
+
+ boolean putNext = mergingIter.putNext(size);
+ boolean iterNext = sourceIter.next();
+
+ if (!putNext && iterNext)
+ throw new IllegalStateException("Source iterator ran out before dest iterator?!");
+
+ return iterNext;
+ }
+
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
new file mode 100644
index 00000000..d9290e14
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
@@ -0,0 +1,125 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.rwf.RandomFileAssembler;
+import nu.marginalia.slop.SlopTable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+/** A LongArray with document data, segmented according to
+ * the associated PrioPreindexWordSegments data
+ */
+public class PrioPreindexDocuments {
+ public final LongArray documents;
+
+ private static final int RECORD_SIZE_LONGS = 1;
+ private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
+
+ public final Path file;
+
+ public PrioPreindexDocuments(LongArray documents, Path file) {
+ this.documents = documents;
+ this.file = file;
+ }
+
+ public static PrioPreindexDocuments construct(
+ Path docsFile,
+ Path workDir,
+ IndexJournalPage journalInstance,
+ DocIdRewriter docIdRewriter,
+ PrioPreindexWordSegments segments) throws IOException {
+
+ createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
+
+ LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
+ sortDocsFile(docsFileMap, segments);
+
+ return new PrioPreindexDocuments(docsFileMap, docsFile);
+ }
+
+ public FileChannel createDocumentsFileChannel() throws IOException {
+ return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+ }
+
+
+ public long size() {
+ return documents.size();
+ }
+
+ private static void createUnsortedDocsFile(Path docsFile,
+ Path workDir,
+ IndexJournalPage instance,
+ PrioPreindexWordSegments segments,
+ DocIdRewriter docIdRewriter) throws IOException {
+
+ long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
+
+ try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+ var slopTable = new SlopTable(instance.baseDir(), instance.page()))
+ {
+ var docIds = instance.openCombinedId(slopTable);
+ var termIds = instance.openTermIds(slopTable);
+ var termMeta = instance.openTermMetadata(slopTable);
+
+ var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+ offsetMap.defaultReturnValue(0);
+
+
+ while (docIds.hasRemaining()) {
+ long docId = docIds.get();
+ long rankEncodedId = docIdRewriter.rewriteDocId(docId);
+
+ long[] tIds = termIds.get();
+ byte[] tMeta = termMeta.get();
+
+ for (int i = 0; i < tIds.length; i++) {
+ long termId = tIds[i];
+ byte meta = tMeta[i];
+
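+ // only terms with at least one metadata flag set are included
+ // in the priority index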
+ if (meta != 0) {
+ long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+ assembly.put(offset, rankEncodedId);
+ }
+ }
+ }
+
+ assembly.write(docsFile);
+ }
+ }
+
+ @SneakyThrows
+ private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
+
+ var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+ while (iter.next()) {
+ long iterStart = iter.startOffset;
+ long iterEnd = iter.endOffset;
+
+ docsFileMap.sort(iterStart, iterEnd);
+ }
+ }
+
+ public void delete() throws IOException {
+ Files.delete(this.file);
+ documents.close();
+ }
+
+ public void close() {
+ documents.close();
+ }
+
+ public void force() {
+ documents.force();
+ }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
new file mode 100644
index 00000000..f2ccd8df
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
@@ -0,0 +1,36 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArrayFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/** This is a dehydrated version of a PrioPreIndex, that only
+ * keeps references to its location on disk but does not hold associated
+ * memory maps.
+ */
+public record PrioPreindexReference(
+ Path wordsFile,
+ Path countsFile,
+ Path documentsFile
+)
+{
+ public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+ this(segments.wordsFile, segments.countsFile, documents.file);
+ }
+
+ public PrioPreindex open() throws IOException {
+ return new PrioPreindex(
+ new PrioPreindexWordSegments(
+ LongArrayFactory.mmapForModifyingShared(wordsFile),
+ LongArrayFactory.mmapForModifyingShared(countsFile),
+ wordsFile,
+ countsFile
+ ),
+ new PrioPreindexDocuments(
+ LongArrayFactory.mmapForModifyingShared(documentsFile),
+ documentsFile
+ )
+ );
+ }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
new file mode 100644
index 00000000..69c5ea61
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
@@ -0,0 +1,221 @@
+package nu.marginalia.index.construction.prio;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongIterator;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.slop.SlopTable;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/** A pair of file-backed arrays of sorted wordIds
+ * and the count of documents associated with each termId.
+ */
+public class PrioPreindexWordSegments {
+ public final LongArray wordIds;
+ public final LongArray counts;
+
+ final Path wordsFile;
+ final Path countsFile;
+
+ public PrioPreindexWordSegments(LongArray wordIds,
+ LongArray counts,
+ Path wordsFile,
+ Path countsFile)
+ {
+ assert wordIds.size() == counts.size();
+
+ this.wordIds = wordIds;
+ this.counts = counts;
+ this.wordsFile = wordsFile;
+ this.countsFile = countsFile;
+ }
+
+ /** Returns a long-long hash map where each key is a termId,
+ * and each value is the start offset of the data.
+ */
+ public Long2LongOpenHashMap asMap(int recordSize) {
+ if (wordIds.size() > Integer.MAX_VALUE)
+ throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries");
+
+ Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
+ var iter = iterator(recordSize);
+
+ while (iter.next()) {
+ ret.put(iter.wordId, iter.startOffset);
+ }
+
+ return ret;
+ }
+
+ public static PrioPreindexWordSegments construct(IndexJournalPage instance,
+ Path wordIdsFile,
+ Path countsFile)
+ throws IOException
+ {
+ Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
+ countsMap.defaultReturnValue(0);
+
+ try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) {
+ var termIds = instance.openTermIds(slopTable);
+ var termMetas = instance.openTermMetadata(slopTable);
+
+ while (termIds.hasRemaining()) {
+ long[] data = termIds.get();
+ byte[] meta = termMetas.get();
+
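+ // count only flagged terms; this filter must match the one in
+ // PrioPreindexDocuments.createUnsortedDocsFile, or the segment counts
+ // would not line up with the records actually written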
+ for (int i = 0; i < data.length; i++) {
+ if (meta[i] != 0) {
+ countsMap.addTo(data[i], 1);
+ }
+ }
+ }
+ }
+
+ LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
+ LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
+
+ // Create the words file by iterating over the map and inserting them into
+ // the words file in whatever bizarro hash table order they appear in
+ long i = 0;
+ LongIterator iter = countsMap.keySet().iterator();
+ while (iter.hasNext()) {
+ words.set(i++, iter.nextLong());
+ }
+
+ // Sort the words file
+ words.sort(0, counts.size());
+
+ // Populate the counts
+ for (i = 0; i < countsMap.size(); i++) {
+ counts.set(i, countsMap.get(words.get(i)));
+ }
+
+ return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile);
+ }
+
+ public SegmentIterator iterator(int recordSize) {
+ return new SegmentIterator(recordSize);
+ }
+ public SegmentConstructionIterator constructionIterator(int recordSize) {
+ return new SegmentConstructionIterator(recordSize);
+ }
+
+ public long totalSize() {
+ return counts.fold(0, 0, counts.size(), Long::sum);
+ }
+
+ public void delete() throws IOException {
+ Files.delete(countsFile);
+ Files.delete(wordsFile);
+
+ counts.close();
+ wordIds.close();
+ }
+
+ public void force() {
+ counts.force();
+ wordIds.force();
+ }
+
+ public void close() {
+ wordIds.close();
+ counts.close();
+ }
+
+ public class SegmentIterator {
+ private final int recordSize;
+ private final long fileSize;
+ long wordId;
+ long startOffset = 0;
+ long endOffset = 0;
+
+ private SegmentIterator(int recordSize) {
+ this.recordSize = recordSize;
+ this.fileSize = wordIds.size();
+ }
+
+ private long i = -1;
+ public long idx() {
+ return i;
+ }
+ public boolean next() {
+ if (++i >= fileSize) {
+ wordId = Long.MIN_VALUE;
+ return false;
+ }
+
+ wordId = wordIds.get(i);
+ startOffset = endOffset;
+ endOffset = startOffset + recordSize * counts.get(i);
+
+ return true;
+ }
+
+ public boolean hasMorePositions() {
+ return i + 1 < wordIds.size();
+ }
+
+ public boolean isPositionBeforeEnd() {
+ return i < wordIds.size();
+ }
+
+ public long size() {
+ return endOffset - startOffset;
+ }
+ }
+
+ class SegmentConstructionIterator {
+ private final int recordSize;
+ private final long fileSize;
+ long wordId;
+ long startOffset = 0;
+ long endOffset = 0;
+
+ private SegmentConstructionIterator(int recordSize) {
+ this.recordSize = recordSize;
+ this.fileSize = wordIds.size();
+ if (fileSize == 0) {
+ throw new IllegalArgumentException("Cannot construct zero-length word segment file");
+ }
+ this.wordId = wordIds.get(0);
+ }
+
+ private long i = 0;
+ public long idx() {
+ return i;
+ }
+
+ public boolean putNext(long size) {
+
+ if (i >= fileSize)
+ return false;
+
+ endOffset = startOffset + recordSize * size;
+ counts.set(i, size);
+ startOffset = endOffset;
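+ // poison endOffset; the next putNext() call recomputes it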
+ endOffset = -1;
+
+ i++;
+
+ if (i == fileSize) {
+ // We've reached the end of the iteration and there is no
+ // "next" termId to fetch
+ wordId = Long.MIN_VALUE;
+ return false;
+ }
+ else {
+ wordId = wordIds.get(i);
+ return true;
+ }
+ }
+
+ public boolean canPutMore() {
+ return i < wordIds.size();
+ }
+ }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java
new file mode 100644
index 00000000..9df63eec
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java
@@ -0,0 +1,25 @@
+package nu.marginalia.index.positions;
+
+/** A utility class for encoding and decoding position data offsets.
+ * The data is encoded by using the highest 16 bits to store the size of the data,
+ * and the remaining 48 bits to store the offset.
+ *
+ * This lets us address 256 TB of data, with up to 64 KB of position data for each term,
+ * which is ample headroom for both the size of the data and the number of positions.
+ * */
+public class PositionCodec {
+
+ public static long encode(int length, long offset) {
+ assert decodeSize(offset) == 0 : "Offset must be less than 2^48";
+
+ return (long) length << 48 | offset;
+ }
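+
+ // e.g. encode(100, 1234L) == 0x0064_0000_0000_04D2L, from which
+ // decodeSize recovers 100 and decodeOffset recovers 1234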
+
+ public static int decodeSize(long sizeEncodedOffset) {
+ return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48);
+ }
+ public static long decodeOffset(long sizeEncodedOffset) {
+ return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL;
+ }
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java
new file mode 100644
index 00000000..43418155
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java
@@ -0,0 +1,43 @@
+package nu.marginalia.index.positions;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public class PositionsFileReader implements AutoCloseable {
+ private final FileChannel positions;
+ private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
+
+ public PositionsFileReader(Path positionsFile) throws IOException {
+ this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
+ }
+
+ /** Get the positions for a term in the index, as pointed out by the encoded offset;
+ * intermediate buffers are allocated from the provided arena allocator. */
+ public TermData getTermData(Arena arena, long sizeEncodedOffset) {
+ int length = PositionCodec.decodeSize(sizeEncodedOffset);
+ long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
+
+ var segment = arena.allocate(length);
+ var buffer = segment.asByteBuffer();
+
+ try {
+ positions.read(buffer, offset);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return new TermData(buffer);
+ }
+
+ @Override
+ public void close() throws IOException {
+ positions.close();
+ }
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java
new file mode 100644
index 00000000..737f10f1
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java
@@ -0,0 +1,22 @@
+package nu.marginalia.index.positions;
+
+import nu.marginalia.sequence.CodedSequence;
+import nu.marginalia.sequence.VarintCodedSequence;
+
+import java.nio.ByteBuffer;
+
+public class TermData {
+ private final ByteBuffer buffer;
+
+ public TermData(ByteBuffer buffer) {
+ this.buffer = buffer;
+ }
+
+ public byte flags() {
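+ // layout of the backing buffer: byte 0 holds the term flags,
+ // the remaining bytes hold the varint coded position sequence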
+ return buffer.get(0);
+ }
+
+ public CodedSequence positions() {
+ return new VarintCodedSequence(buffer, 1, buffer.capacity());
+ }
+}
diff --git a/code/index/index-reverse/readme.md b/code/index/index-reverse/readme.md
index fcc4fcfc..0874bf8d 100644
--- a/code/index/index-reverse/readme.md
+++ b/code/index/index-reverse/readme.md
@@ -7,7 +7,10 @@ There are two tiers of this index.
* A priority index which only indexes terms that are flagged with priority flags[1].
* A full index that indexes all terms.
-The full index also provides access to term-level metadata, while the priority index is a binary index that only offers information about which documents has a specific word.
+The full index also provides access to term-level metadata, while the priority index is
+a binary index that only offers information about which documents have a specific word.
+
+The priority index is also compressed, while the full index at this point is not.
[1] See WordFlags in [common/model](../../common/model/) and
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
@@ -34,9 +37,16 @@ to form a finalized reverse index.

## Central Classes
-* [ReversePreindex](java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
-* [ReverseIndexConstructor](java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
-* [ReverseIndexReader](java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
+Full index:
+* [FullPreindex](java/nu/marginalia/index/construction/full/FullPreindex.java) intermediate reverse index state.
+* [FullIndexConstructor](java/nu/marginalia/index/construction/full/FullIndexConstructor.java) constructs the index.
+* [FullReverseIndexReader](java/nu/marginalia/index/FullReverseIndexReader.java) interrogates the index.
+
+Prio index:
+* [PrioPreindex](java/nu/marginalia/index/construction/prio/PrioPreindex.java) intermediate reverse index state.
+* [PrioIndexConstructor](java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java) constructs the index.
+* [PrioReverseIndexReader](java/nu/marginalia/index/PrioReverseIndexReader.java) interrogates the index.
+
## See Also
diff --git a/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java
new file mode 100644
index 00000000..d77d2133
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java
@@ -0,0 +1,119 @@
+package nu.marginalia.index;
+
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.construction.full.FullPreindex;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.positions.PositionsFileReader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
+import static org.junit.jupiter.api.Assertions.*;
+
+class FullReverseIndexReaderTest {
+ TestJournalFactory journalFactory;
+ Path tempDir;
+
+ @BeforeEach
+ public void setUp() throws IOException {
+ journalFactory = new TestJournalFactory();
+
+ tempDir = Files.createTempDirectory("sort");
+ }
+
+ @AfterEach
+ public void tearDown() throws IOException {
+ journalFactory.clear();
+
+ List<Path> contents = new ArrayList<>();
+ Files.list(tempDir).forEach(contents::add);
+ for (var tempFile : contents) {
+ Files.delete(tempFile);
+ }
+ Files.delete(tempDir);
+ }
+
+ MurmurHash3_128 hash = new MurmurHash3_128();
+ long termId(String keyword) {
+ return hash.hashKeyword(keyword);
+ }
+
+ @Test
+ public void testSimple() throws IOException {
+
+ var indexReader = createIndex(
+ new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
+ );
+
+ assertEquals(1, indexReader.numDocuments(termId("50")));
+
+ var positions = indexReader.getTermData(Arena.global(), termId("50"), new long[] { 100 });
+
+ assertEquals(1, positions.length);
+ assertNotNull(positions[0]);
+ assertEquals((byte) 51, positions[0].flags());
+ assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());
+
+ assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
+ }
+
+
+ @Test
+ public void test2x2() throws IOException {
+
+ var indexReader = createIndex(
+ new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
+ new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
+ );
+
+ assertEquals(1, indexReader.numDocuments(termId("50")));
+ assertEquals(2, indexReader.numDocuments(termId("51")));
+ assertEquals(1, indexReader.numDocuments(termId("52")));
+
+ assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50")));
+ assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, termId("51")));
+ assertArrayEquals(new long[] { 101 }, readEntries(indexReader, termId("52")));
+
+ }
+
+ private long[] readEntries(FullReverseIndexReader reader, long wordId) {
+ var es = reader.documents(wordId);
+ assertTrue(es.hasMore());
+ LongQueryBuffer buffer = new LongQueryBuffer(4);
+ es.read(buffer);
+ assertFalse(es.hasMore());
+ return buffer.copyData();
+ }
+
+ private FullReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
+ var reader = journalFactory.createReader(scenario);
+
+ Path posFile = tempDir.resolve("positions.dat");
+ Path docsFile = tempDir.resolve("docs.dat");
+ Path wordsFile = tempDir.resolve("words.dat");
+
+ try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
+ var preindex = FullPreindex.constructPreindex(reader,
+ positionsFileConstructor,
+ DocIdRewriter.identity(), tempDir);
+ preindex.finalizeIndex(docsFile, wordsFile);
+ preindex.delete();
+ }
+
+ return new FullReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
+
+ }
+}
\ No newline at end of file
diff --git a/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java
new file mode 100644
index 00000000..6d512333
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java
@@ -0,0 +1,63 @@
+package nu.marginalia.index;
+
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.positions.PositionsFileReader;
+import nu.marginalia.index.positions.TermData;
+import nu.marginalia.sequence.VarintCodedSequence;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class PositionsFileReaderTest {
+
+ Path file;
+
+ @BeforeEach
+ void setUp() throws IOException {
+ file = Files.createTempFile("positions", "dat");
+ }
+ @AfterEach
+ void tearDown() throws IOException {
+ Files.delete(file);
+ }
+
+ @Test
+ void getTermData() throws IOException {
+ ByteBuffer workArea = ByteBuffer.allocate(8192);
+ long key1, key2, key3;
+ try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
+ key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer());
+ key2 = constructor.add((byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer());
+ key3 = constructor.add((byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer());
+ }
+
+ System.out.println("key1: " + Long.toHexString(key1));
+ System.out.println("key2: " + Long.toHexString(key2));
+ System.out.println("key3: " + Long.toHexString(key3));
+
+ try (Arena arena = Arena.ofConfined();
+ PositionsFileReader reader = new PositionsFileReader(file))
+ {
+ TermData data1 = reader.getTermData(arena, key1);
+ assertEquals(43, data1.flags());
+ assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
+
+ TermData data2 = reader.getTermData(arena, key2);
+ assertEquals(51, data2.flags());
+ assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
+
+ TermData data3 = reader.getTermData(arena, key3);
+ assertEquals(61, data3.flags());
+ assertEquals(IntList.of(3, 5, 7), data3.positions().values());
+ }
+ }
+}
\ No newline at end of file
diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java
index 6f612a06..359e9396 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java
@@ -26,7 +26,7 @@ public class ReverseIndexDebugTest {
long wordOffset = wordsBTreeReader.findEntry(problemWord);
assertTrue(wordOffset >= 0);
- var docsReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordOffset);
+ var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
// We find problemDoc even though it doesn't exist in the document range
long docOffset = docsReader.findEntry(problemDoc);
diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java
deleted file mode 100644
index 265864c4..00000000
--- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-package nu.marginalia.index;
-
-import nu.marginalia.array.page.LongQueryBuffer;
-import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.ReversePreindex;
-import nu.marginalia.index.construction.TestJournalFactory;
-import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-
-import static nu.marginalia.index.construction.TestJournalFactory.wm;
-import static org.junit.jupiter.api.Assertions.*;
-
-class ReverseIndexReaderTest {
- TestJournalFactory journalFactory;
- Path tempDir;
-
- @BeforeEach
- public void setUp() throws IOException {
- journalFactory = new TestJournalFactory();
-
- tempDir = Files.createTempDirectory("sort");
- }
-
- @AfterEach
- public void tearDown() throws IOException {
- journalFactory.clear();
-
- List<Path> contents = new ArrayList<>();
- Files.list(tempDir).forEach(contents::add);
- for (var tempFile : contents) {
- Files.delete(tempFile);
- }
- Files.delete(tempDir);
- }
-
- @Test
- public void testSimple() throws IOException {
-
- var indexReader = createIndex(
- new EntryDataWithWordMeta(100, 101, wm(50, 51))
- );
-
- assertEquals(1, indexReader.numDocuments(50));
-
- long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
- assertArrayEquals(new long[] { 51 }, meta);
- assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
- }
-
- @Test
- public void test2x2() throws IOException {
-
- var indexReader = createIndex(
- new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
- new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
- );
-
- assertEquals(1, indexReader.numDocuments(50));
- assertEquals(2, indexReader.numDocuments(51));
- assertEquals(1, indexReader.numDocuments(52));
-
- assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
- assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
-
- assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
- assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
-
- assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
- assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
-
- }
-
- private long[] readEntries(ReverseIndexReader reader, long wordId) {
- var es = reader.documents(wordId);
- assertTrue(es.hasMore());
- LongQueryBuffer buffer = new LongQueryBuffer(4);
- es.read(buffer);
- assertFalse(es.hasMore());
- return buffer.copyData();
- }
-
- private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
- var reader = journalFactory.createReader(scenario);
- var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
-
-
- Path docsFile = tempDir.resolve("docs.dat");
- Path wordsFile = tempDir.resolve("words.dat");
-
- preindex.finalizeIndex(docsFile, wordsFile);
- preindex.delete();
-
- return new ReverseIndexReader("test", wordsFile, docsFile);
-
- }
-}
\ No newline at end of file
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java
deleted file mode 100644
index 1a173d9a..00000000
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java
+++ /dev/null
@@ -1,424 +0,0 @@
-
-package nu.marginalia.index.construction;
-
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.*;
-
-import static nu.marginalia.index.construction.TestJournalFactory.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class ReversePreindexMergeTest {
- TestJournalFactory journalFactory;
- Path countsFile;
- Path wordsIdFile;
- Path docsFile;
- Path tempDir;
-
- @BeforeEach
- public void setUp() throws IOException {
- journalFactory = new TestJournalFactory();
-
- countsFile = Files.createTempFile("counts", ".dat");
- wordsIdFile = Files.createTempFile("words", ".dat");
- docsFile = Files.createTempFile("docs", ".dat");
- tempDir = Files.createTempDirectory("sort");
- }
-
- @AfterEach
- public void tearDown() throws IOException {
- journalFactory.clear();
-
- Files.deleteIfExists(countsFile);
- Files.deleteIfExists(wordsIdFile);
- List<Path> contents = new ArrayList<>();
- Files.list(tempDir).forEach(contents::add);
- for (var tempFile : contents) {
- Files.delete(tempFile);
- }
- Files.delete(tempDir);
- }
-
- public ReversePreindex runMergeScenario(
- List<EntryDataWithWordMeta> leftData,
- List<EntryDataWithWordMeta> rightData
- ) throws IOException {
- var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
- var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
-
- var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir);
- var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir);
- return ReversePreindex.merge(tempDir, left, right);
- }
-
- private List<TestSegmentData> getData(ReversePreindex merged) {
- var iter = merged.segments.iterator(2);
- List<TestSegmentData> actual = new ArrayList<>();
- while (iter.next()) {
- long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
- merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
- data));
- }
- return actual;
- }
-
- @Test
- public void testDocsMergeSingleNoOverlap() throws IOException {
-
- IdSequence docIds = new IdSequence();
- IdSequence docMetas = new IdSequence();
- IdSequence wordMetas = new IdSequence();
- IdSequence wordIds = new IdSequence();
-
- var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
- var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
-
- var merged = runMergeScenario(
- leftSequence,
- rightSequence
- );
-
- var actual = getData(merged);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- System.out.println(actual);
- assertEquals(expected, actual);
- }
-
- @Test
- public void testDocsMergeSingleOnlyOverlap() throws IOException {
-
- IdSequence docIds = new IdSequence();
- IdSequence docMetas = new IdSequence();
- IdSequence wordMetas = new IdSequence();
- IdSequence wordIds = new IdSequence();
-
- var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
- var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
-
- var merged = runMergeScenario(
- leftSequence,
- rightSequence
- );
-
- var actual = getData(merged);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- System.out.println(actual);
- assertEquals(expected, actual);
- }
-
- @Test
- public void testDocsMergeSingleOnlyOverlap2() throws IOException {
-
- long wid1 = 1;
- long wid2 = 2;
- IdSequence docIds = new IdSequence();
- IdSequence docMetas = new IdSequence();
- IdSequence wordMetas = new IdSequence();
-
- var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
- wm(wid1, wordMetas.nextUnique()),
- wm(wid2, wordMetas.nextUnique())
- ));
- var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
- wm(wid1, wordMetas.nextUnique()),
- wm(wid2, wordMetas.nextUnique())
- ));
-
- var merged = runMergeScenario(
- leftSequence,
- rightSequence
- );
-
- var actual = getData(merged);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- System.out.println(actual);
- assertEquals(expected, actual);
- }
-
- @Test
- public void testBadCase1() throws IOException {
- long wordId = 0xF00F00BA3L;
-
- List<EntryDataWithWordMeta> leftSequence = List.of(new EntryDataWithWordMeta(40, 50,
- wm(wordId, 5))
- );
- List<EntryDataWithWordMeta> rightSequence = List.of(new EntryDataWithWordMeta(41, 51,
- wm(wordId, 3),
- wm(wordId, 4))
- );
-
- var mergedLR = runMergeScenario(
- leftSequence,
- rightSequence
- );
- var mergedRL = runMergeScenario(
- rightSequence,
- leftSequence
- );
-
- var actualLR = getData(mergedLR);
- var actualRL = getData(mergedRL);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- assertEquals(actualLR, actualRL);
-
- if (!expected.equals(actualLR)) {
- System.out.println("*fail*");
- System.out.println(leftSequence);
- System.out.println(rightSequence);
- }
- else {
- System.out.println("*pass*");
- }
-
- assertEquals(expected, actualLR);
-
- }
-
- @Test
- public void testBadCase2() throws IOException {
- long wordId = 100;
-
- List<EntryDataWithWordMeta> leftSequence = List.of(
- new EntryDataWithWordMeta(1, 50, wm(wordId, 5)),
- new EntryDataWithWordMeta(2, 50, wm(wordId, 5))
-
- );
- List<EntryDataWithWordMeta> rightSequence = List.of(
- new EntryDataWithWordMeta(3, 50, wm(wordId, 5))
- );
-
- var mergedLR = runMergeScenario(
- leftSequence,
- rightSequence
- );
- var mergedRL = runMergeScenario(
- rightSequence,
- leftSequence
- );
-
- var actualLR = getData(mergedLR);
- var actualRL = getData(mergedRL);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- assertEquals(actualLR, actualRL);
-
- if (!expected.equals(actualLR)) {
- System.out.println("*fail*");
- System.out.println(leftSequence);
- System.out.println(rightSequence);
- }
- else {
- System.out.println("*pass*");
- }
-
- assertEquals(expected, actualLR);
-
- }
-
- @Test
- public void testFuzz() throws IOException {
- Random r = new Random();
- int maxDocs = 150;
- int maxWords = 160;
- int nIters = 1000;
-
- for (int i = 0; i < nIters; i++) {
- int nLeft = 1 + r.nextInt(maxDocs);
- int nRight = 1 + r.nextInt(maxDocs);
-
- IdSequence docIdsLeft = new IdSequence();
- IdSequence docIdsRight = new IdSequence();
- IdSequence docMetas = new IdSequence();
- IdSequence wordMetas = new IdSequence();
- IdSequence wordIds = new IdSequence();
-
- List<EntryDataWithWordMeta> leftSequence = new ArrayList<>(nLeft);
- for (int j = 0; j < nLeft; j++) {
- WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
- Arrays.setAll(words, idx -> {
- long wordId = wordIds.seenWithP(1.0);
- long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
- return wm(wordId, wordMeta);
- });
-
- long docId = docIdsLeft.nextUnique();
- long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
- leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
- }
-
- List<EntryDataWithWordMeta> rightSequence = new ArrayList<>(nLeft);
- for (int j = 0; j < nRight; j++) {
- WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
- Arrays.setAll(words, idx -> {
- long wordId = wordIds.seenWithP(1.0);
- long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
- return wm(wordId, wordMeta);
- });
-
- long docId = docIdsRight.seenWithP(docIdsLeft, 0.1);
- long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
- rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
- }
-
- var mergedLR = runMergeScenario(
- leftSequence,
- rightSequence
- );
- var mergedRL = runMergeScenario(
- rightSequence,
- leftSequence
- );
-
- var actualLR = getData(mergedLR);
- var actualRL = getData(mergedRL);
-
- var expected = simulateMerge(leftSequence, rightSequence);
-
- assertEquals(actualLR, actualRL);
-
- if (!expected.equals(actualLR)) {
- System.out.println("*fail*");
- System.out.println(leftSequence);
- System.out.println(rightSequence);
- }
- else {
- System.out.println("*pass*");
- }
-
- assertEquals(expected, actualLR);
-
- }
- }
-
-
- public List<TestSegmentData> simulateMerge(
- Collection<EntryDataWithWordMeta> leftInputs,
- Collection<EntryDataWithWordMeta> rightInputs
- ) {
- TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
-
- for (var entry : leftInputs) {
- for (var wm : entry.wordIds()) {
- wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
- new DocWithMeta(entry.docId(), wm.meta())
- );
- }
- }
- for (var entry : rightInputs) {
- for (var wm : entry.wordIds()) {
- wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
- new DocWithMeta(entry.docId(), wm.meta())
- );
- }
- }
-
- List<TestSegmentData> ret = new ArrayList<>();
- int[] start = new int[1];
- wordToDocs.forEach((wordId, docsList) -> {
- docsList.sort(Comparator.naturalOrder());
- var iter = docsList.iterator();
- DocWithMeta prevVal = null;
- DocWithMeta currentVal;
- while (iter.hasNext()) {
- currentVal = iter.next();
- if (prevVal != null) {
- if (currentVal.docId == prevVal.docId) {
- iter.remove();
- }
- }
- prevVal = currentVal;
-
- }
- long[] data = new long[docsList.size()*2];
- for (int i = 0; i < docsList.size(); i++) {
- data[2*i] = docsList.get(i).docId;
- data[2*i + 1] = docsList.get(i).meta;
- }
- ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
-
- start[0] += data.length;
- });
- return ret;
- }
-
-
- record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
-
- @Override
- public int compareTo(DocWithMeta o) {
- return Long.compare(docId, o.docId);
- }
- }
-
- class IdSequence {
- Set<Long> seen = new HashSet<>();
- Map<Long, Long> associatedValues = new HashMap<>();
- private Random random = new Random();
-
- /** Return alreadySeen() with probability p,
- * else nextUnique()
- */
- public long seenWithP(double p) {
- if (isEmpty() || random.nextDouble() > p)
- return nextUnique();
-
- return alreadySeenSameSequence();
- }
-
- public long seenWithP(IdSequence other, double p) {
- if (isEmpty() || random.nextDouble() > p)
- return nextUnique();
-
- return alreadySeenOtherSequence(other);
- }
-
- public long nextUnique() {
- for (;;) {
- long val = random.nextLong();
- if (seen.add(val)) {
- return val;
- }
- }
- }
-
- public long nextUniqueAssociatedWithKey(long key) {
- return associatedValues.computeIfAbsent(key, k -> nextUnique());
- }
-
- public long alreadySeenSameSequence() {
- long[] values = seen.stream().mapToLong(Long::longValue).toArray();
- int idx = random.nextInt(0, values.length);
- return values[idx];
- }
-
- public long alreadySeenOtherSequence(IdSequence other) {
- List<Long> values = new ArrayList<>(other.seen);
- Collections.shuffle(values);
- for (Long maybe : values) {
- if (seen.add(maybe))
- return maybe;
- }
- return nextUnique();
- }
-
- public boolean isEmpty() {
- return seen.isEmpty();
- }
- }
-
-}
\ No newline at end of file
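The reference model above (simulateMerge) pins down the merge semantics: per word, take the union of (docId, meta) pairs from both inputs, sort by docId, and keep only the first entry for any duplicated docId. A standalone sketch of that invariant, with illustrative names that are not part of the codebase:

    import java.util.*;

    class MergeInvariantSketch {
        // word -> [(docId, meta)] pairs; concatenate both sides, sort by docId,
        // then drop repeated docIds, mirroring the deleted test's reference model.
        static SortedMap<Long, List<long[]>> merge(Map<Long, List<long[]>> left,
                                                   Map<Long, List<long[]>> right) {
            SortedMap<Long, List<long[]>> merged = new TreeMap<>();
            for (var side : List.of(left, right)) {
                side.forEach((word, docs) ->
                        merged.computeIfAbsent(word, w -> new ArrayList<>()).addAll(docs));
            }
            for (var docs : merged.values()) {
                docs.sort(Comparator.comparingLong(d -> d[0]));
                long prev = Long.MIN_VALUE;
                for (var it = docs.iterator(); it.hasNext(); ) {
                    long id = it.next()[0];
                    if (id == prev) it.remove();   // keep first occurrence only
                    else prev = id;
                }
            }
            return merged;
        }
    }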
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java
deleted file mode 100644
index 0ad3205a..00000000
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java
+++ /dev/null
@@ -1,231 +0,0 @@
-package nu.marginalia.index.construction;
-
-import nu.marginalia.array.LongArray;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-
-import static nu.marginalia.index.construction.TestJournalFactory.*;
-import static org.junit.jupiter.api.Assertions.*;
-
-class ReversePreindexWordSegmentsTest {
- Path countsFile;
- Path wordsIdFile;
- Path docsFile;
- Path tempDir;
-
- TestJournalFactory journalFactory;
-
- @BeforeEach
- public void setUp() throws IOException {
- journalFactory = new TestJournalFactory();
-
- countsFile = Files.createTempFile("counts", ".dat");
- wordsIdFile = Files.createTempFile("words", ".dat");
- docsFile = Files.createTempFile("docs", ".dat");
- tempDir = Files.createTempDirectory("sort");
- }
-
- @AfterEach
- public void tearDown() throws IOException {
- journalFactory.clear();
-
- Files.deleteIfExists(countsFile);
- Files.deleteIfExists(wordsIdFile);
- List<Path> contents = new ArrayList<>();
- Files.list(tempDir).forEach(contents::add);
- for (var tempFile : contents) {
- Files.delete(tempFile);
- }
- Files.delete(tempDir);
- }
- @Test
- public void testWordSegmentsLongWordId() throws IOException {
- var reader = journalFactory.createReader(
- new EntryData(-0xF00BA3L, 0, 1L<<33)
- );
-
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var iter = segments.iterator(1);
-
- List<TestSegmentData> expected = List.of(
- new TestSegmentData(1L<<33, 0, 1)
- );
-
- List<TestSegmentData> actual = new ArrayList<>();
-
- while (iter.next()) {
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
- }
-
- assertEquals(expected, actual);
- }
- @Test
- public void testWordSegmentsRepeatedWordId() throws IOException {
- var reader = journalFactory.createReader(
- new EntryData(-0xF00BA3L, 0, 5, 5)
- );
-
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var iter = segments.iterator(1);
-
- List<TestSegmentData> expected = List.of(
- new TestSegmentData(5, 0, 2)
- );
-
- List<TestSegmentData> actual = new ArrayList<>();
-
- while (iter.next()) {
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
- }
-
- assertEquals(expected, actual);
- }
-
- @Test
- public void testWordSegments1() throws IOException {
- var reader = journalFactory.createReader(
- new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
- );
-
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var iter = segments.iterator(1);
-
- List<TestSegmentData> expected = List.of(
- new TestSegmentData(-100, 0, 1),
- new TestSegmentData(10, 1, 2),
- new TestSegmentData(33, 2, 3),
- new TestSegmentData(40, 3, 4)
- );
-
- List<TestSegmentData> actual = new ArrayList<>();
-
- while (iter.next()) {
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
- }
-
- assertEquals(expected, actual);
- }
-
- @Test
- public void testWordSegments2() throws IOException {
- var reader = journalFactory.createReader(
- new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
- new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
- );
-
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var iter = segments.iterator(1);
-
- List<TestSegmentData> expected = List.of(
- new TestSegmentData(-100, 0, 2),
- new TestSegmentData(10, 2, 3),
- new TestSegmentData(15, 3, 4),
- new TestSegmentData(30, 4, 5),
- new TestSegmentData(33, 5, 7),
- new TestSegmentData(40, 7, 8)
- );
-
- List<TestSegmentData> actual = new ArrayList<>();
-
- while (iter.next()) {
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
- }
-
- assertEquals(expected, actual);
- }
-
-
- @Test
- public void testWordSegments_ReadIterator() {
- LongArray wordsArray = LongArray.allocate(4);
- LongArray countsArray = LongArray.allocate(4);
- wordsArray.set(0, -1, -2, -3, -4);
- countsArray.set(0, 2, 1, 3, 5);
- var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
-
- var ritr = segments.iterator(1);
- assertTrue(ritr.hasMorePositions());
- assertTrue(ritr.next());
- assertTrue(ritr.isPositionBeforeEnd());
- assertEquals(-1, ritr.wordId);
- assertEquals(0, ritr.idx());
- assertEquals(0, ritr.startOffset);
- assertEquals(2, ritr.endOffset);
-
- assertTrue(ritr.hasMorePositions());
- assertTrue(ritr.next());
- assertTrue(ritr.isPositionBeforeEnd());
- assertEquals(-2, ritr.wordId);
- assertEquals(1, ritr.idx());
- assertEquals(2, ritr.startOffset);
- assertEquals(3, ritr.endOffset);
-
- assertTrue(ritr.hasMorePositions());
- assertTrue(ritr.next());
- assertTrue(ritr.isPositionBeforeEnd());
- assertEquals(-3, ritr.wordId);
- assertEquals(2, ritr.idx());
- assertEquals(3, ritr.startOffset);
- assertEquals(6, ritr.endOffset);
-
- assertTrue(ritr.hasMorePositions());
- assertTrue(ritr.next());
- assertTrue(ritr.isPositionBeforeEnd());
- assertEquals(-4, ritr.wordId);
- assertEquals(3, ritr.idx());
- assertEquals(6, ritr.startOffset);
- assertEquals(11, ritr.endOffset);
-
- assertFalse(ritr.hasMorePositions());
- assertFalse(ritr.next());
- assertFalse(ritr.isPositionBeforeEnd());
-
- assertEquals(Long.MIN_VALUE, ritr.wordId);
- }
-
-
- @Test
- public void testWordSegments_ConstructionIterator() {
- LongArray wordsArray = LongArray.allocate(4);
- LongArray countsArray = LongArray.allocate(4);
- wordsArray.set(0, -1, -2, -3, -4);
- var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
-
- var citr = segments.constructionIterator(1);
- assertEquals(-1, citr.wordId);
- assertEquals(0, citr.idx());
- assertTrue(citr.canPutMore());
- assertTrue(citr.putNext(1));
- assertEquals(1, countsArray.get(0));
-
- assertEquals(-2, citr.wordId);
- assertEquals(1, citr.idx());
- assertTrue(citr.canPutMore());
- assertTrue(citr.putNext(2));
- assertEquals(2, countsArray.get(1));
-
- assertEquals(-3, citr.wordId);
- assertEquals(2, citr.idx());
- assertTrue(citr.canPutMore());
- assertTrue(citr.putNext(3));
- assertEquals(3, countsArray.get(2));
-
- assertEquals(-4, citr.wordId);
- assertEquals(3, citr.idx());
- assertTrue(citr.canPutMore());
- assertFalse(citr.putNext(4));
- assertEquals(4, countsArray.get(3));
-
- assertEquals(4, citr.idx());
- assertFalse(citr.canPutMore());
- assertEquals(Long.MIN_VALUE, citr.wordId);
- }
-
-}
\ No newline at end of file
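The read-iterator assertions in the deleted segments test are a prefix sum over the counts array: counts {2, 1, 3, 5} yield the half-open document ranges [0,2), [2,3), [3,6), [6,11). A minimal sketch of that arithmetic, using plain arrays in place of the memory-mapped LongArray:

    // Derive each word's (startOffset, endOffset) from per-word counts.
    static long[][] segmentRanges(long[] counts) {
        long[][] ranges = new long[counts.length][2];
        long offset = 0;
        for (int i = 0; i < counts.length; i++) {
            ranges[i][0] = offset;   // startOffset for word i
            offset += counts[i];
            ranges[i][1] = offset;   // endOffset, exclusive
        }
        return ranges;
    }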
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java
deleted file mode 100644
index b122921b..00000000
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java
+++ /dev/null
@@ -1,93 +0,0 @@
-package nu.marginalia.index.construction;
-
-import nu.marginalia.index.journal.model.IndexJournalEntryData;
-import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-public class TestJournalFactory {
- Path tempDir = Files.createTempDirectory("journal");
-
- public TestJournalFactory() throws IOException {}
-
- public void clear() throws IOException {
- List<Path> toDelete = new ArrayList<>();
- try (var dirStream = Files.list(tempDir)) {
- dirStream.forEach(toDelete::add);
- }
- for (var tempFile : toDelete) {
- Files.delete(tempFile);
- }
- Files.delete(tempDir);
- }
-
- public record EntryData(long docId, long docMeta, long... wordIds) {
- @Override
- public String toString() {
- return "EntryData{" +
- "docId=" + docId +
- ", docMeta=" + docMeta +
- ", wordIds=" + Arrays.toString(wordIds) +
- '}';
- }
- }
- public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta... wordIds) {
- @Override
- public String toString() {
- return "EntryDataWithWordMeta{" +
- "docId=" + docId +
- ", docMeta=" + docMeta +
- ", wordIds=" + Arrays.toString(wordIds) +
- '}';
- }
- }
- public record WordWithMeta(long wordId, long meta) {}
-
- public static WordWithMeta wm(long wordId, long meta) {
- return new WordWithMeta(wordId, meta);
- }
-
- IndexJournalReader createReader(EntryData... entries) throws IOException {
- Path jf = Files.createTempFile(tempDir, "journal", ".dat");
-
- var writer = new IndexJournalWriterSingleFileImpl(jf);
- for (var entry : entries) {
- long[] data = new long[entry.wordIds.length * 2];
- for (int i = 0; i < entry.wordIds.length; i++)
- data[i*2] = entry.wordIds[i];
-
- writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
- new IndexJournalEntryData(data));
- }
- writer.close();
- var ret = new IndexJournalReaderSingleFile(jf);
- return ret;
- }
-
- public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException {
- Path jf = Files.createTempFile(tempDir, "journal", ".dat");
-
- var writer = new IndexJournalWriterSingleFileImpl(jf);
- for (var entry : entries) {
- long[] data = new long[entry.wordIds.length * 2];
- for (int i = 0; i < entry.wordIds.length; i++) {
- data[i * 2] = entry.wordIds[i].wordId;
- data[i * 2 + 1] = entry.wordIds[i].meta;
- }
-
- writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
- new IndexJournalEntryData(data));
- }
- writer.close();
- var ret = new IndexJournalReaderSingleFile(jf);
- return ret;
- }
-}
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
similarity index 65%
rename from code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java
rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
index d6d81818..8f6e6a14 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
@@ -1,5 +1,8 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -11,14 +14,15 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData;
import static org.junit.jupiter.api.Assertions.assertEquals;
-class ReversePreindexDocsTest {
+class FullPreindexDocsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
Path tempDir;
+ Path positionsFile;
TestJournalFactory journalFactory;
@@ -30,6 +34,7 @@ class ReversePreindexDocsTest {
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
+ positionsFile = tempDir.resolve("positions.dat");
}
@AfterEach
@@ -38,6 +43,9 @@ class ReversePreindexDocsTest {
Files.deleteIfExists(countsFile);
Files.deleteIfExists(wordsIdFile);
+ Files.deleteIfExists(positionsFile);
+ Files.deleteIfExists(docsFile);
+
List<Path> contents = new ArrayList<>();
Files.list(tempDir).forEach(contents::add);
for (var tempFile : contents) {
@@ -46,33 +54,9 @@ class ReversePreindexDocsTest {
Files.delete(tempDir);
}
- @Test
- public void testDocs() throws IOException {
- var reader = journalFactory.createReader(
- new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
- );
-
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
-
- List<TestSegmentData> expected = List.of(
- new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
- new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
- new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
- new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
- );
-
- List<TestSegmentData> actual = new ArrayList<>();
-
- var iter = segments.iterator(2);
- while (iter.next()) {
- long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
- docs.slice(iter.startOffset, iter.endOffset).get(0, data);
- actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
- data));
- }
-
- assertEquals(expected, actual);
+ MurmurHash3_128 hash = new MurmurHash3_128();
+ long termId(String keyword) {
+ return hash.hashKeyword(keyword);
}
@Test
@@ -81,11 +65,13 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 4, 4)
);
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
+ var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+ var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+ new PositionsFileConstructor(positionsFile),
+ segments);
List<TestSegmentData> expected = List.of(
- new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
+ new TestSegmentData(termId("4"), 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
);
List<TestSegmentData> actual = new ArrayList<>();
@@ -100,6 +86,7 @@ class ReversePreindexDocsTest {
assertEquals(expected, actual);
}
+
@Test
public void testDocs2() throws IOException {
var reader = journalFactory.createReader(
@@ -107,8 +94,10 @@ class ReversePreindexDocsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments);
+ var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+ var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+ new PositionsFileConstructor(positionsFile),
+ segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
@@ -145,15 +134,15 @@ class ReversePreindexDocsTest {
if (wordId != that.wordId) return false;
if (start != that.start) return false;
if (end != that.end) return false;
- return Arrays.equals(data, that.data);
+ return data[0] == that.data[0]; //Arrays.equals(data, that.data);
}
@Override
public int hashCode() {
- int result = (int) (wordId ^ (wordId >>> 32));
- result = 31 * result + (int) (start ^ (start >>> 32));
- result = 31 * result + (int) (end ^ (end >>> 32));
- result = 31 * result + Arrays.hashCode(data);
+ int result = Long.hashCode(wordId);
+ result = 31 * result + Long.hashCode(start);
+ result = 31 * result + Long.hashCode(end);
+ result = 31 * result + Long.hashCode(data[0]);
return result;
}
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
similarity index 75%
rename from code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
index 1ef2df4e..253e0d52 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java
@@ -1,9 +1,11 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
-import nu.marginalia.btree.BTreeReader;
import nu.marginalia.btree.model.BTreeHeader;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -11,14 +13,17 @@ import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
-import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
-class ReversePreindexFinalizeTest {
+class FullPreindexFinalizeTest {
TestJournalFactory journalFactory;
+ Path positionsFile;
Path countsFile;
Path wordsIdFile;
Path docsFile;
@@ -28,6 +33,7 @@ class ReversePreindexFinalizeTest {
public void setUp() throws IOException {
journalFactory = new TestJournalFactory();
+ positionsFile = Files.createTempFile("positions", ".dat");
countsFile = Files.createTempFile("counts", ".dat");
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
@@ -48,10 +54,17 @@ class ReversePreindexFinalizeTest {
Files.delete(tempDir);
}
+ MurmurHash3_128 hash = new MurmurHash3_128();
+ long termId(String keyword) {
+ return hash.hashKeyword(keyword);
+ }
+
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
- var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
+ var preindex = FullPreindex.constructPreindex(reader,
+ new PositionsFileConstructor(positionsFile),
+ DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
@@ -76,9 +89,7 @@ class ReversePreindexFinalizeTest {
assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
- assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
- assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
- assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
+ assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
}
@@ -89,7 +100,9 @@ class ReversePreindexFinalizeTest {
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
- var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
+ var preindex = FullPreindex.constructPreindex(reader,
+ new PositionsFileConstructor(positionsFile),
+ DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();
@@ -116,10 +129,8 @@ class ReversePreindexFinalizeTest {
long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
- assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
- assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
- assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
- assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
+ assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
+ assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs()));
BTreeHeader docsHeader;
@@ -128,13 +139,11 @@ class ReversePreindexFinalizeTest {
assertEquals(1, docsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
- assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
- assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
}
}
\ No newline at end of file
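Two patterns recur in the rewritten finalize assertions: word ids are now compared via termId("50") rather than the literal 50, since words are keyed by a MurmurHash3_128 hash of the keyword string; and the old checks on the second long of each docs entry (the word meta values 51 and 52) are gone, as under this patch that slot appears to hold an offset into the new positions file rather than the meta itself. The shared helper the tests repeat is tiny:

    // Fold the keyword string to a 64-bit term id via the keyword hash,
    // as the rewritten tests do throughout this patch.
    MurmurHash3_128 hash = new MurmurHash3_128();
    long termId(String keyword) {
        return hash.hashKeyword(keyword);
    }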
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java
new file mode 100644
index 00000000..1be94b55
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java
@@ -0,0 +1,130 @@
+package nu.marginalia.index.construction.full;
+
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
+import nu.marginalia.model.processed.SlopDocumentRecord;
+import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.test.TestUtil;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+public class TestJournalFactory {
+ Path tempDir = Files.createTempDirectory("journal");
+
+ public TestJournalFactory() throws IOException {}
+
+ public void clear() throws IOException {
+ TestUtil.clearTempDir(tempDir);
+ }
+
+ public record EntryData(long docId, long docMeta, String... wordIds) {
+ public EntryData(long docId, long docMeta, long... wordIds) {
+ this(docId, docMeta, Arrays.stream(wordIds).mapToObj(String::valueOf).toArray(String[]::new));
+ }
+ @Override
+ public String toString() {
+ return "EntryData{" +
+ "docId=" + docId +
+ ", docMeta=" + docMeta +
+ ", wordIds=" + Arrays.toString(wordIds) +
+ '}';
+ }
+ }
+ public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta... wordIds) {
+ @Override
+ public String toString() {
+ return "EntryDataWithWordMeta{" +
+ "docId=" + docId +
+ ", docMeta=" + docMeta +
+ ", wordIds=" + Arrays.toString(wordIds) +
+ '}';
+ }
+ }
+ public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) {
+ public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) {
+ this(String.valueOf(wordId), meta, gcs);
+ }
+ }
+
+ public static WordWithMeta wm(long wordId, int meta, int... positions) {
+ return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions));
+ }
+
+ public IndexJournalPage createReader(EntryData... entries) throws IOException {
+ Path ji = Files.createTempDirectory(tempDir, "journal");
+
+ var writer = new IndexJournalSlopWriter(ji, 0);
+ for (var entry : entries) {
+ String[] termIds = new String[entry.wordIds.length];
+ byte[] meta = new byte[entry.wordIds.length];
+
+ VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
+ for (int i = 0; i < entry.wordIds.length; i++) {
+ termIds[i] = entry.wordIds[i];
+ meta[i] = 0;
+ positions[i] = VarintCodedSequence.generate();
+ }
+
+ writer.put(
+ entry.docId,
+ new SlopDocumentRecord.KeywordsProjection(
+ "test",
+ -1,
+ 0,
+ entry.docMeta,
+ 15,
+ Arrays.asList(termIds),
+ meta,
+ Arrays.asList(positions),
+ new byte[0],
+ List.of()
+ )
+ );
+ }
+ writer.close();
+
+ return new IndexJournalPage(ji, 0);
+ }
+
+ public IndexJournalPage createReader(EntryDataWithWordMeta... entries) throws IOException {
+ Path ji = Files.createTempDirectory(tempDir, "journal");
+
+ var writer = new IndexJournalSlopWriter(ji, 0);
+ for (var entry : entries) {
+
+ String[] termIds = new String[entry.wordIds.length];
+ byte[] meta = new byte[entry.wordIds.length];
+ VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length];
+ for (int i = 0; i < entry.wordIds.length; i++) {
+ termIds[i] = entry.wordIds[i].wordId;
+ meta[i] = entry.wordIds[i].meta;
+ positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate);
+ }
+
+ writer.put(
+ entry.docId,
+ new SlopDocumentRecord.KeywordsProjection(
+ "test",
+ -1,
+ 0,
+ entry.docMeta,
+ 15,
+ Arrays.asList(termIds),
+ meta,
+ Arrays.asList(positions),
+ new byte[0],
+ List.of()
+ )
+ );
+
+ }
+ writer.close();
+
+ return new IndexJournalPage(ji, 0);
+ }
+}
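A hedged usage sketch of the rewritten factory (hypothetical test code, not part of the patch): terms are now identified by strings, carry a byte of metadata, and may have varint-coded positions attached via wm(wordId, meta, positions...).

    import nu.marginalia.index.construction.full.TestJournalFactory;
    import java.io.IOException;

    class TestJournalFactoryUsage {
        public static void main(String[] args) throws IOException {
            var factory = new TestJournalFactory();
            // one document (id 100, meta 101) containing word 50
            // with meta 51 and positions 1, 3, 5
            var page = factory.createReader(
                    new TestJournalFactory.EntryDataWithWordMeta(100, 101,
                            TestJournalFactory.wm(50, 51, 1, 3, 5)));
            // hand `page` to a preindex constructor, then clean up:
            factory.clear();
        }
    }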
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java
similarity index 80%
rename from code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java
rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java
index 574bb61a..d325e029 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java
@@ -1,9 +1,9 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
import java.util.Arrays;
-record TestSegmentData(long wordId, long start, long end, long[] data) {
- public TestSegmentData(long wordId, long start, long end) {
+record TestSegmentData(String wordId, long start, long end, long[] data) {
+ public TestSegmentData(String wordId, long start, long end) {
this(wordId, start, end, null);
}
@@ -22,7 +22,7 @@ record TestSegmentData(long wordId, long start, long end, long[] data) {
@Override
public int hashCode() {
- int result = (int) (wordId ^ (wordId >>> 32));
+ int result = wordId.hashCode();
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
new file mode 100644
index 00000000..e4ced16d
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java
@@ -0,0 +1,128 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitReader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class PrioDocIdsTransformerTest {
+
+ Path inputFile = null;
+ Path outputFile = null;
+
+ @BeforeEach
+ public void setUp() throws IOException {
+ inputFile = Files.createTempFile("input", ".dat");
+ outputFile = Files.createTempFile("output", ".dat");
+ }
+
+ @AfterEach
+ public void tearDown() throws IOException {
+ if (inputFile != null) {
+ Files.deleteIfExists(inputFile);
+ }
+ if (outputFile != null) {
+ Files.deleteIfExists(outputFile);
+ }
+ }
+
+ @Test
+ public void testDomainIdDocOrd() throws IOException {
+
+
+ try (var writeChannel = (FileChannel) Files.newByteChannel(inputFile, StandardOpenOption.WRITE)) {
+ var buffer = ByteBuffer.allocate(128).order(ByteOrder.LITTLE_ENDIAN);
+
+ buffer.putLong(UrlIdCodec.encodeId(0, 0));
+ buffer.putLong(UrlIdCodec.encodeId(0, 1));
+ buffer.putLong(UrlIdCodec.encodeId(1, 0));
+ buffer.putLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
+
+ writeChannel.write(buffer.flip());
+ }
+
+ try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
+ var readChannel = (FileChannel) Files.newByteChannel(inputFile);
+ var transformer = new PrioDocIdsTransformer(writeChannel, readChannel))
+ {
+ // Transform a single segment of the input file and write it to the output file with a prefixed size
+
+ transformer.transform(0, 4);
+ }
+
+ byte[] bytes = Files.readAllBytes(outputFile);
+ var buffer = ByteBuffer.wrap(bytes);
+
+
+ BitReader reader = new BitReader(buffer);
+
+ // read the header
+ {
+ int code = reader.get(2);
+ int size = reader.get(30);
+ assertEquals(3, code);
+ assertEquals(4, size);
+ }
+
+ // read first doc id in parts
+ int rank = reader.get(7);
+ int domainId = reader.get(31);
+ int ordinal = reader.get(26);
+
+ assertEquals(0, rank);
+ assertEquals(0, domainId);
+ assertEquals(0, ordinal);
+
+ {
+ int code = reader.get(2);
+ assertEquals(0, code); // increment doc ordinal
+
+ int dord = reader.getGamma();
+ ordinal += dord;
+
+ assertEquals(1, ordinal);
+ }
+
+ {
+ int code = reader.get(2);
+ assertEquals(1, code); // new domain id within the same rank
+
+ int diffDomainId = reader.getDelta();
+ domainId += diffDomainId;
+ assertEquals(1, domainId);
+
+ int abs_ord = reader.getDelta();
+ ordinal = abs_ord - 1;
+ assertEquals(0, ordinal);
+ }
+
+ {
+ int code = reader.get(2);
+ assertEquals(2, code); // new rank
+
+ int diffRank = reader.getGamma();
+ rank += diffRank;
+ assertEquals(56, rank);
+
+ domainId = reader.get(31);
+ ordinal = reader.get(26);
+
+ assertEquals(4, domainId);
+ assertEquals(51, ordinal);
+ }
+ }
+
+}
\ No newline at end of file
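The magic widths in this test (7, 31 and 26 bits) reflect the combined document id layout it assumes: rank in the top bits, then domain id, then document ordinal. Under that assumption, OR-ing 0x7000_0000_0000_0000L into an id sets rank bits worth 56, which is exactly what the final block asserts. An illustrative sketch of that layout (not the real UrlIdCodec):

    class CombinedIdLayoutSketch {
        // rank:7 | domainId:31 | documentOrdinal:26, high bits to low
        static long encode(long rank, long domainId, long ordinal) {
            return (rank << 57) | (domainId << 26) | ordinal;
        }
        static int rank(long id)     { return (int) (id >>> 57); }
        static int domainId(long id) { return (int) ((id >>> 26) & 0x7FFF_FFFFL); }
        static int ordinal(long id)  { return (int) (id & 0x3FF_FFFFL); }
    }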
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
new file mode 100644
index 00000000..6075fa8a
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java
@@ -0,0 +1,185 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.PrioReverseIndexReader;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.model.id.UrlIdCodec;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class PrioPreindexTest {
+ Path countsFile;
+ Path wordsIdFile;
+ Path docsFile;
+ Path tempDir;
+ Path positionsFile;
+
+ TestJournalFactory journalFactory;
+
+ @BeforeEach
+ public void setUp() throws IOException {
+ journalFactory = new TestJournalFactory();
+
+ countsFile = Files.createTempFile("counts", ".dat");
+ wordsIdFile = Files.createTempFile("words", ".dat");
+ docsFile = Files.createTempFile("docs", ".dat");
+ tempDir = Files.createTempDirectory("sort");
+ positionsFile = tempDir.resolve("positions.dat");
+ }
+
+ @AfterEach
+ public void tearDown() throws IOException {
+ journalFactory.clear();
+
+ Files.deleteIfExists(countsFile);
+ Files.deleteIfExists(wordsIdFile);
+ Files.deleteIfExists(positionsFile);
+ Files.deleteIfExists(docsFile);
+
+ List<Path> contents = new ArrayList<>();
+ Files.list(tempDir).forEach(contents::add);
+ for (var tempFile : contents) {
+ Files.delete(tempFile);
+ }
+ Files.delete(tempDir);
+ }
+
+ MurmurHash3_128 hash = new MurmurHash3_128();
+ long termId(String keyword) {
+ return hash.hashKeyword(keyword);
+ }
+
+ @Test
+ public void testFinalizeSimple() throws IOException {
+ var journalReader = journalFactory.createReader(
+ new EntryDataWithWordMeta(100, 101, wm(50, 51)),
+ new EntryDataWithWordMeta(104, 101, wm(50, 52)),
+ new EntryDataWithWordMeta(106, 101, wm(50, 52))
+ );
+
+ var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
+ preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
+ preindex.delete();
+
+ Path wordsFile = tempDir.resolve("words.dat");
+ Path docsFile = tempDir.resolve("docs.dat");
+
+ assertTrue(Files.exists(wordsFile));
+ assertTrue(Files.exists(docsFile));
+
+ var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
+
+ var entrySource = indexReader.documents(termId("50"));
+ var lqb = new LongQueryBuffer(32);
+ entrySource.read(lqb);
+
+ assertEquals(3, lqb.size());
+ assertEquals(100, lqb.copyData()[0]);
+ assertEquals(104, lqb.copyData()[1]);
+ assertEquals(106, lqb.copyData()[2]);
+ }
+
+ @Test
+ public void testFinalizeLargeData() throws IOException {
+ int rankComponent = 0;
+ int domainComponent = 0;
+ int docOrdinal = 0;
+ var random = new Random();
+ long[] documentIds = new long[10000];
+
+ for (int i = 0; i < documentIds.length; i++) {
+ int scenario = random.nextInt(0, 3);
+
+ // Avoid going into scenario 2 when we've already reached max rank
+ // instead fall back into scenario 0 as this should be the more common
+ // of the two
+ if (rankComponent == 63 && scenario == 2) {
+ scenario = 0;
+ }
+
+ if (scenario == 0) {
+ docOrdinal += random.nextInt(1, 100);
+ } else if (scenario == 1) {
+ domainComponent+=random.nextInt(1, 1000);
+ docOrdinal=random.nextInt(0, 10000);
+ } else {
+ rankComponent = Math.min(63, rankComponent + random.nextInt(1, 2));
+ domainComponent=random.nextInt(0, 10000);
+ docOrdinal=random.nextInt(0, 10000);
+ }
+
+ documentIds[i] = UrlIdCodec.encodeId(rankComponent, domainComponent, docOrdinal);
+ }
+
+ EntryDataWithWordMeta[] entries = new EntryDataWithWordMeta[documentIds.length];
+ for (int i = 0; i < documentIds.length; i++) {
+ entries[i] = new EntryDataWithWordMeta(documentIds[i], 101, wm(50, 51));
+ }
+ var journalReader = journalFactory.createReader(entries);
+
+ var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
+ preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
+ preindex.delete();
+
+ Path wordsFile = tempDir.resolve("words.dat");
+ Path docsFile = tempDir.resolve("docs.dat");
+
+ assertTrue(Files.exists(wordsFile));
+ assertTrue(Files.exists(docsFile));
+
+ var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
+
+ int items = indexReader.numDocuments(termId("50"));
+ assertEquals(documentIds.length, items);
+
+ var entrySource = indexReader.documents(termId("50"));
+ var lqb = new LongQueryBuffer(32);
+
+ for (int pos = 0; pos < documentIds.length;) {
+ if (!entrySource.hasMore()) {
+ Assertions.fail("Out of data @ " + pos);
+ }
+
+ entrySource.read(lqb);
+
+ var dataArray = lqb.copyData();
+ for (int i = 0; i < lqb.size(); i++) {
+
+ long currValue = dataArray[i];
+
+ if (documentIds[i + pos] != currValue) {
+ System.out.println("Mismatch at position " + (i + pos));
+
+ long prevValue = documentIds[i + pos - 1];
+ long expectedValue = documentIds[i + pos];
+
+ System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue));
+ System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue));
+ System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue));
+
+ assertTrue(currValue > prevValue, "Current value is not greater than previous value");
+
+ Assertions.fail();
+ }
+ }
+ pos += lqb.size();
+ }
+
+ }
+}
\ No newline at end of file
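testFinalizeLargeData constructs its ids so that every scenario strictly increases the combined id (bumping rank or domain dominates any ordinal change), then demands the reader return them in exactly that order. Stated as a standalone check (illustrative, assuming the prio index delta-codes successive ids and so requires ascending input):

    static void assertStrictlyIncreasing(long[] ids) {
        for (int i = 1; i < ids.length; i++) {
            if (ids[i] <= ids[i - 1])
                throw new AssertionError("non-increasing combined id at index " + i);
        }
    }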
diff --git a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java b/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java
deleted file mode 100644
index 8fbf6b54..00000000
--- a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package nu.marginalia.test;
-
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-
-public class TestUtil {
- public static void clearTempDir(Path dir) {
- if (Files.isDirectory(dir)) {
- for (File f : dir.toFile().listFiles()) {
- File[] files = f.listFiles();
- if (files != null) {
- Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
- }
- System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
- f.delete();
- }
- }
- System.out.println("Deleting " + dir);
- dir.toFile().delete();
- }
-
- private static String fileSize(Path path) {
- try {
- long sizeBytes = Files.size(path);
-
- if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
- if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
- if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
- return sizeBytes + "b";
- }
- catch (IOException ex) {
- throw new RuntimeException(ex);
- }
- }
-
- private static String round(double d) {
- return String.format("%.2f", d);
- }
-}
diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java
index a1d2f5a5..e388793f 100644
--- a/code/index/java/nu/marginalia/index/IndexFactory.java
+++ b/code/index/java/nu/marginalia/index/IndexFactory.java
@@ -3,12 +3,11 @@ package nu.marginalia.index;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
-import nu.marginalia.index.index.CombinedIndexReader;
-import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.ForwardIndexReader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import nu.marginalia.index.index.CombinedIndexReader;
+import nu.marginalia.index.positions.PositionsFileReader;
+import nu.marginalia.storage.FileStorageService;
import java.io.IOException;
import java.nio.file.Files;
@@ -39,16 +38,16 @@ public class IndexFactory {
return IndexLocations.getSearchSetsPath(fileStorageService);
}
- public ReverseIndexReader getReverseIndexReader() throws IOException {
-
- return new ReverseIndexReader("full",
+ public FullReverseIndexReader getReverseIndexReader() throws IOException {
+ return new FullReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
- ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT)
+ ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
+ new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
);
}
- public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
- return new ReverseIndexReader("prio",
+ public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
+ return new PrioReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
);
@@ -57,7 +56,8 @@ public class IndexFactory {
public ForwardIndexReader getForwardIndexReader() throws IOException {
return new ForwardIndexReader(
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT),
- ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
+ ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT),
+ ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT)
);
}
diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java
index 1c430014..81172a5b 100644
--- a/code/index/java/nu/marginalia/index/IndexGrpcService.java
+++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java
@@ -8,23 +8,26 @@ import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import lombok.SneakyThrows;
-import nu.marginalia.api.searchquery.*;
+import nu.marginalia.api.searchquery.IndexApiGrpc;
+import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
+import nu.marginalia.api.searchquery.RpcIndexQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
-import nu.marginalia.api.searchquery.model.results.*;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
-import nu.marginalia.index.results.IndexResultValuatorService;
+import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
+import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
-import nu.marginalia.index.searchset.SearchSet;
import nu.marginalia.service.module.ServiceConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -32,7 +35,8 @@ import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.sql.SQLException;
-import java.util.*;
+import java.util.BitSet;
+import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
@@ -81,7 +85,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
- private final IndexResultValuatorService resultValuator;
+ private final IndexResultRankingService resultValuator;
private final String nodeName;
@@ -91,7 +95,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
- IndexResultValuatorService resultValuator)
+ IndexResultRankingService resultValuator)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
@@ -110,11 +114,17 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
long endTime = System.currentTimeMillis() + request.getQueryLimits().getTimeoutMs();
- SearchResultSet results = wmsa_query_time
+ List<RpcDecoratedResultItem> results = wmsa_query_time
.labels(nodeName, "GRPC")
.time(() -> {
// Perform the search
- return executeSearch(params);
+ try {
+ return executeSearch(params);
+ }
+ catch (Exception ex) {
+ logger.error("Error in handling request", ex);
+ return List.of();
+ }
});
// Prometheus bookkeeping
@@ -129,47 +139,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
// Send the results back to the client
- for (var result : results.results) {
-
- var rawResult = result.rawIndexResult;
-
- var rawItem = RpcRawResultItem.newBuilder();
- rawItem.setCombinedId(rawResult.combinedId);
- rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
- rawItem.setHtmlFeatures(rawResult.htmlFeatures);
- rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
- rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
-
- for (var score : rawResult.keywordScores) {
- rawItem.addKeywordScores(
- RpcResultKeywordScore.newBuilder()
- .setEncodedWordMetadata(score.encodedWordMetadata())
- .setKeyword(score.keyword)
- );
- }
-
- var decoratedBuilder = RpcDecoratedResultItem.newBuilder()
- .setDataHash(result.dataHash)
- .setDescription(result.description)
- .setFeatures(result.features)
- .setFormat(result.format)
- .setRankingScore(result.rankingScore)
- .setTitle(result.title)
- .setUrl(result.url.toString())
- .setUrlQuality(result.urlQuality)
- .setWordsTotal(result.wordsTotal)
- .setBestPositions(result.bestPositions)
- .setRawItem(rawItem);
-
- var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails);
- if (rankingDetails != null) {
- decoratedBuilder.setRankingDetails(rankingDetails);
- }
-
- if (result.pubYear != null) {
- decoratedBuilder.setPubYear(result.pubYear);
- }
- responseObserver.onNext(decoratedBuilder.build());
+ for (var result : results) {
+ responseObserver.onNext(result);
}
responseObserver.onCompleted();
@@ -183,7 +154,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
// exists for test access
@SneakyThrows
- SearchResultSet justQuery(SearchSpecification specsSet) {
+ List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
}
@@ -205,11 +176,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
}
- private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException {
+ // accessible for tests
+ public List<RpcDecoratedResultItem> executeSearch(SearchParameters params) throws SQLException, InterruptedException {
if (!statefulIndex.isLoaded()) {
// Short-circuit if the index is not loaded, as we trivially know that there can be no results
- return new SearchResultSet(List.of());
+ return List.of();
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
@@ -218,7 +190,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
- var ret = queryExecution.run(params);
+ List<RpcDecoratedResultItem> ret = queryExecution.run(params);
wmsa_index_query_exec_block_time
.labels(nodeName)
@@ -230,30 +202,69 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
return ret;
}
+ /** Build the ranking context for this query, gathering per-term hit counts
+ * from the current index and partitioning terms into ngram and regular masks
+ */
+ private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
+ CompiledQuery<String> compiledQuery,
+ CompiledQueryLong compiledQueryIds)
+ {
+
+ int[] full = new int[compiledQueryIds.size()];
+ int[] prio = new int[compiledQueryIds.size()];
+
+ BitSet ngramsMask = new BitSet(compiledQuery.size());
+ BitSet regularMask = new BitSet(compiledQuery.size());
+
+ var currentIndex = statefulIndex.get();
+
+ for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
+ long id = compiledQueryIds.at(idx);
+ full[idx] = currentIndex.numHits(id);
+ prio[idx] = currentIndex.numHitsPrio(id);
+
+ if (compiledQuery.at(idx).contains("_")) {
+ ngramsMask.set(idx);
+ }
+ else {
+ regularMask.set(idx);
+ }
+ }
+
+ return new ResultRankingContext(currentIndex.totalDocCount(),
+ rankingParams,
+ ngramsMask,
+ regularMask,
+ new CqDataInt(full),
+ new CqDataInt(prio));
+ }
+
/** This class is responsible for executing a search query. It uses a thread pool to
* execute the subqueries and their valuation in parallel. The results are then combined
* into a bounded priority queue, and finally the best results are returned.
*/
private class QueryExecution {
+
private static final Executor workerPool = Executors.newWorkStealingPool(indexValuationThreads*4);
/** The queue where the results from the index lookup threads are placed,
* pending ranking by the result ranker threads */
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
= new ArrayBlockingQueue<>(8);
-
private final ResultPriorityQueue resultHeap;
+
private final ResultRankingContext resultRankingContext;
-
private final AtomicInteger remainingIndexTasks = new AtomicInteger(0);
- private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
+ private final AtomicInteger remainingValuationTasks = new AtomicInteger(0);
private final AtomicLong blockTime = new AtomicLong(0);
+
private final AtomicLong stallTime = new AtomicLong(0);
public long getStallTime() {
return stallTime.get();
}
+
public long getBlockTime() {
return blockTime.get();
}
@@ -264,7 +275,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
/** Execute a search query */
- public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException {
+ public List<RpcDecoratedResultItem> run(SearchParameters parameters) throws SQLException, InterruptedException {
var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds);
@@ -281,10 +292,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
awaitCompletion();
// Return the best results
- return new SearchResultSet(
- resultValuator.selectBestResults(parameters,
- resultRankingContext,
- resultHeap));
+ return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
}
/** Wait for all tasks to complete */
@@ -295,12 +303,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
}
}
-
/** This class is responsible for executing a subquery and adding the results to the
* resultCandidateQueue, which depending on the state of the valuator threads may
* or may not block */
class IndexLookup implements Runnable {
private final IndexQuery query;
+
private final IndexSearchBudget budget;
IndexLookup(IndexQuery query,
@@ -315,6 +323,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
try {
executeSearch();
}
+ catch (Exception ex) {
+ logger.error("Error in index lookup", ex);
+ }
finally {
synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) {
@@ -325,10 +336,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
private void executeSearch() {
- final LongArrayList results = new LongArrayList(512);
+ final LongArrayList results = new LongArrayList(64);
// These queries address different indices for the same subquery
- final LongQueryBuffer buffer = new LongQueryBuffer(512);
+ final LongQueryBuffer buffer = new LongQueryBuffer(64);
while (query.hasMore() && budget.hasTimeLeft())
{
@@ -339,7 +350,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
results.add(buffer.data.get(i));
}
- if (results.size() < 512) {
+ if (results.size() >= 64) {
enqueueResults(new CombinedDocIdList(results));
results.clear();
}
@@ -366,13 +377,11 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
logger.warn("Interrupted while waiting to offer resultIds to queue", e);
}
}
- }
- /** This class is responsible for ranking the results and adding the best results to the
- * resultHeap, which depending on the state of the indexLookup threads may or may not block
- */
+ }
class ResultRanker implements Runnable {
private final SearchParameters parameters;
+
private final ResultRankingContext rankingContext;
ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) {
@@ -415,49 +424,16 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
stallTime.addAndGet(System.currentTimeMillis() - start);
resultHeap.addAll(
- resultValuator.rankResults(parameters, rankingContext, resultIds)
+ resultValuator.rankResults(parameters, false, rankingContext, resultIds)
);
}
return true; // keep going
}
+
}
}
- private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
- CompiledQuery<String> compiledQuery,
- CompiledQueryLong compiledQueryIds)
- {
-
- int[] full = new int[compiledQueryIds.size()];
- int[] prio = new int[compiledQueryIds.size()];
-
- BitSet ngramsMask = new BitSet(compiledQuery.size());
- BitSet regularMask = new BitSet(compiledQuery.size());
-
- var currentIndex = statefulIndex.get();
-
- for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
- long id = compiledQueryIds.at(idx);
- full[idx] = currentIndex.numHits(id);
- prio[idx] = currentIndex.numHitsPrio(id);
-
- if (compiledQuery.at(idx).contains("_")) {
- ngramsMask.set(idx);
- }
- else {
- regularMask.set(idx);
- }
- }
-
- return new ResultRankingContext(currentIndex.totalDocCount(),
- rankingParams,
- ngramsMask,
- regularMask,
- new CqDataInt(full),
- new CqDataInt(prio));
- }
-
}
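The QueryExecution class above is a bounded producer/consumer pipeline: IndexLookup threads batch candidate document ids into a small blocking queue, and ResultRanker threads drain that queue into a bounded priority heap, so neither stage can run far ahead of the other. Below is a minimal, self-contained sketch of that shape; all names, batch sizes, and the stand-in scoring function are illustrative, not the service's real API.

import java.util.PriorityQueue;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

public class BoundedPipelineSketch {
    // a small queue bounds memory use and applies back-pressure to producers
    private static final BlockingQueue<long[]> candidates = new ArrayBlockingQueue<>(8);
    private static final AtomicInteger remainingProducers = new AtomicInteger(2);

    public static void main(String[] args) throws InterruptedException {
        PriorityQueue<Double> heap = new PriorityQueue<>(); // min-heap: evicts the worst score first
        final int maxResults = 10;

        for (int p = 0; p < 2; p++) {
            final long seed = 1000L * p;
            new Thread(() -> {
                try {
                    for (int batch = 0; batch < 4; batch++) {
                        long[] ids = new long[64];
                        for (int i = 0; i < ids.length; i++)
                            ids[i] = seed + 64L * batch + i;
                        candidates.put(ids); // blocks when the rankers fall behind
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } finally {
                    remainingProducers.decrementAndGet();
                }
            }).start();
        }

        // drain until every producer has finished and the queue is empty
        while (remainingProducers.get() > 0 || !candidates.isEmpty()) {
            long[] ids = candidates.poll(10, TimeUnit.MILLISECONDS);
            if (ids == null) continue;
            for (long id : ids) {
                heap.add((double) (id % 97)); // stand-in for real result valuation
                if (heap.size() > maxResults)
                    heap.poll(); // drop the lowest-scoring result
            }
        }
        System.out.println("best scores: " + heap);
    }
}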
diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
index afc52094..216192cf 100644
--- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
+++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java
@@ -5,8 +5,10 @@ import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.index.ReverseIndexReader;
+import nu.marginalia.index.FullReverseIndexReader;
+import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
+import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
@@ -14,12 +16,13 @@ import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.results.model.ids.DocMetadataList;
+import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.lang.foreign.Arena;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
@@ -37,30 +40,25 @@ public class CombinedIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForwardIndexReader forwardIndexReader;
- private final ReverseIndexReader reverseIndexFullReader;
- private final ReverseIndexReader reverseIndexPriorityReader;
+ private final FullReverseIndexReader reverseIndexFullReader;
+ private final PrioReverseIndexReader reverseIndexPriorityReader;
public CombinedIndexReader(ForwardIndexReader forwardIndexReader,
- ReverseIndexReader reverseIndexFullReader,
- ReverseIndexReader reverseIndexPriorityReader) {
+ FullReverseIndexReader reverseIndexFullReader,
+ PrioReverseIndexReader reverseIndexPriorityReader) {
this.forwardIndexReader = forwardIndexReader;
this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
- return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
+ return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
}
public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId);
}
- public QueryFilterStepIf hasWordPrio(long termId) {
- return reverseIndexPriorityReader.also(termId);
- }
-
-
/** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
@@ -113,17 +111,28 @@ public class CombinedIndexReader {
return 0;
});
- var head = findFullWord(elements.getLong(0));
- for (int i = 1; i < elements.size(); i++) {
- head.addInclusionFilter(hasWordFull(elements.getLong(i)));
+ if (!SearchTerms.stopWords.contains(elements.getLong(0))) {
+ var head = findFullWord(elements.getLong(0));
+
+ for (int i = 1; i < elements.size(); i++) {
+ long termId = elements.getLong(i);
+
+ // if a stop word is present in the query, skip the step of requiring it to be in the document;
+ // we'll assume it's there and save the IO
+ if (SearchTerms.stopWords.contains(termId)) {
+ continue;
+ }
+
+ head.addInclusionFilter(hasWordFull(termId));
+ }
+ queryHeads.add(head);
}
- queryHeads.add(head);
// If there are few paths, we can afford to check the priority index as well
if (paths.size() < 4) {
var prioHead = findPriorityWord(elements.getLong(0));
for (int i = 1; i < elements.size(); i++) {
- prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i)));
+ prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
}
queryHeads.add(prioHead);
}
@@ -169,8 +178,11 @@ public class CombinedIndexReader {
}
/** Retrieves the term metadata for the specified word for the provided documents */
- public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) {
- return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array()));
+ public TermMetadataList getTermMetadata(Arena arena,
+ long wordId,
+ CombinedDocIdList docIds)
+ {
+ return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
}
/** Retrieves the document metadata for the specified document */
@@ -188,6 +200,16 @@ public class CombinedIndexReader {
return forwardIndexReader.getHtmlFeatures(docId);
}
+ /** Retrieves the size of the specified document */
+ public int getDocumentSize(long docId) {
+ return forwardIndexReader.getDocumentSize(docId);
+ }
+
+ /** Retrieves the document spans for the specified document */
+ public DocumentSpans getDocumentSpans(Arena arena, long docId) {
+ return forwardIndexReader.getDocumentSpans(arena, docId);
+ }
+
/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {
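The query-construction hunk above skips inclusion filters for stop words on the assumption that words like "a" or "the" occur in practically every document, so requiring them costs index IO without narrowing the result set. A rough sketch of that decision in isolation, with a plain Set<Long> standing in for SearchTerms.stopWords and strings standing in for filter steps:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class StopWordFilterSketch {
    // hypothetical term ids; the real set holds the hashed ids of "a", "an", "the"
    static final Set<Long> STOP_WORDS = Set.of(1L, 2L, 3L);

    static List<String> buildInclusionFilters(long[] termIds) {
        List<String> filters = new ArrayList<>();
        for (int i = 1; i < termIds.length; i++) { // index 0 is the query head
            if (STOP_WORDS.contains(termIds[i]))
                continue; // assume the term is present; skip the index check
            filters.add("require-term:" + termIds[i]);
        }
        return filters;
    }

    public static void main(String[] args) {
        // term id 2 is a stop word, so only 99 needs an index check
        System.out.println(buildInclusionFilters(new long[] {42L, 2L, 99L}));
    }
}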
diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
index 0f63fdbc..abdbc836 100644
--- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
+++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
@@ -2,7 +2,7 @@ package nu.marginalia.index.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
-import nu.marginalia.index.ReverseIndexReader;
+import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
@@ -10,8 +10,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder {
private final IndexQuery query;
- private final ReverseIndexReader reverseIndexFullReader;
- private final ReverseIndexReader reverseIndexPrioReader;
+ private final FullReverseIndexReader reverseIndexFullReader;
/* Keep track of already added include terms to avoid redundant checks.
*
@@ -21,13 +20,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
* */
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
- IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader,
- ReverseIndexReader reverseIndexPrioReader,
- IndexQuery query)
+ IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
{
this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader;
- this.reverseIndexPrioReader = reverseIndexPrioReader;
}
public IndexQueryBuilder withSourceTerms(long... sourceTerms) {
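The builder keeps alreadyConsideredTerms in a TLongHashSet so that including the same term twice never creates two identical filter steps; the primitive-long set also avoids boxing each term id. The same guard expressed with a plain HashSet<Long>, purely for illustration:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DedupFilterSketch {
    private final Set<Long> alreadyConsidered = new HashSet<>();
    private final List<String> filterSteps = new ArrayList<>();

    /** Add an inclusion filter only the first time a term id is seen */
    public void also(long termId) {
        if (alreadyConsidered.add(termId)) { // add() returns false for duplicates
            filterSteps.add("require-term:" + termId);
        }
    }

    public static void main(String[] args) {
        var sketch = new DedupFilterSketch();
        sketch.also(42);
        sketch.also(42); // duplicate, ignored
        System.out.println(sketch.filterSteps); // [require-term:42]
    }
}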
diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
index 7da5f74b..41c398bf 100644
--- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java
+++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
@@ -90,7 +90,7 @@ public class StatefulIndex {
return combinedIndexReader != null;
}
- /** Stronger version of isAvailable() that also checks that the index is loaded */
+ /** Stronger version of isAvailable() that also checks that the index is loaded */
public boolean isLoaded() {
return combinedIndexReader != null && combinedIndexReader.isLoaded();
}
diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java
index 8115c109..2a475754 100644
--- a/code/index/java/nu/marginalia/index/model/SearchTerms.java
+++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java
@@ -1,21 +1,26 @@
package nu.marginalia.index.model;
import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongComparator;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import java.util.ArrayList;
-import java.util.List;
-
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
public final class SearchTerms {
private final LongList advice;
private final LongList excludes;
private final LongList priority;
- private final List<LongList> coherences;
+
+ public static final LongArraySet stopWords = new LongArraySet(
+ new long[] {
+ getWordId("a"),
+ getWordId("an"),
+ getWordId("the"),
+ }
+ );
private final CompiledQueryLong compiledQueryIds;
@@ -24,7 +29,7 @@ public final class SearchTerms {
{
this.excludes = new LongArrayList();
this.priority = new LongArrayList();
- this.coherences = new ArrayList<>();
+
this.advice = new LongArrayList();
this.compiledQueryIds = compiledQueryIds;
@@ -32,16 +37,6 @@ public final class SearchTerms {
advice.add(getWordId(word));
}
- for (var coherence : query.searchTermCoherences) {
- LongList parts = new LongArrayList(coherence.size());
-
- for (var word : coherence) {
- parts.add(getWordId(word));
- }
-
- coherences.add(parts);
- }
-
for (var word : query.searchTermsExclude) {
excludes.add(getWordId(word));
}
@@ -72,10 +67,6 @@ public final class SearchTerms {
return priority;
}
- public List<LongList> coherences() {
- return coherences;
- }
-
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
}
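Note that stopWords is a LongArraySet over just three term ids: at that size a linear scan of the backing array beats hashing, and membership checks stay allocation-free. A small sketch of the lookup, with a stand-in hash in place of the real getWordId:

import it.unimi.dsi.fastutil.longs.LongArraySet;

public class StopWordSetSketch {
    // stand-in for SearchTermsUtil.getWordId, which hashes the keyword
    static long wordId(String word) {
        return word.hashCode() & 0xFFFF_FFFFL;
    }

    static final LongArraySet STOP_WORDS = new LongArraySet(
            new long[] { wordId("a"), wordId("an"), wordId("the") }
    );

    public static void main(String[] args) {
        System.out.println(STOP_WORDS.contains(wordId("the")));   // true
        System.out.println(STOP_WORDS.contains(wordId("horse"))); // false
    }
}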
diff --git a/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
new file mode 100644
index 00000000..95b665ff
--- /dev/null
+++ b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
@@ -0,0 +1,93 @@
+package nu.marginalia.index.results;
+
+import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
+import nu.marginalia.api.searchquery.model.compiled.CqExpression;
+import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+
+import java.util.BitSet;
+import java.util.List;
+
+/** Visitor that computes a BM25 score over a graph representing a search query,
+ * summing scores over AND nodes and taking the maximum over OR nodes
+ */
+public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
+ private static final long AVG_LENGTH = 5000;
+
+ private final float[] counts;
+ private final CqDataInt frequencies;
+
+ private final double k1;
+ private final double b;
+
+ private final int docCount;
+ private final int length;
+
+ private final BitSet mask;
+
+ public Bm25GraphVisitor(Bm25Parameters bm25Parameters,
+ float[] counts,
+ int length,
+ ResultRankingContext ctx) {
+ this.length = length;
+
+ this.k1 = bm25Parameters.k();
+ this.b = bm25Parameters.b();
+
+ this.docCount = ctx.termFreqDocCount();
+ this.counts = counts;
+ this.frequencies = ctx.fullCounts;
+ this.mask = ctx.regularMask;
+ }
+
+ @Override
+ public double onAnd(List<? extends CqExpression> parts) {
+ double value = 0;
+
+ for (var part : parts) {
+ value += part.visit(this);
+ }
+
+ return value;
+ }
+
+ @Override
+ public double onOr(List<? extends CqExpression> parts) {
+ double value = 0;
+ for (var part : parts) {
+ value = Math.max(value, part.visit(this));
+ }
+ return value;
+ }
+
+ @Override
+ public double onLeaf(int idx) {
+ if (!mask.get(idx)) {
+ return 0;
+ }
+
+ double count = counts[idx];
+ int freq = frequencies.get(idx);
+
+ return invFreq(docCount, freq) * f(count, length);
+ }
+
+ /**
+ * The inverse document frequency (IDF) component of the BM25 score.
+ * @param docCount Number of documents
+ * @param freq Number of matching documents
+ */
+ private double invFreq(int docCount, int freq) {
+ return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
+ }
+
+ /**
+ * The term-frequency saturation component of the BM25 score.
+ * @param count number of occurrences in the document
+ * @param length document length
+ */
+ private double f(double count, int length) {
+ final double lengthRatio = (double) length / AVG_LENGTH;
+
+ return (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio));
+ }
+}
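The score onLeaf() produces is the product of an inverse document frequency factor and a saturating term-frequency factor. A worked example using the visitor's AVG_LENGTH constant and the conventional BM25 defaults k1 = 1.2, b = 0.75 (the parameter values are assumed for illustration; the real values come from Bm25Parameters):

public class Bm25Example {
    static final double K1 = 1.2, B = 0.75;
    static final long AVG_LENGTH = 5000; // same constant as the visitor

    static double invFreq(int docCount, int freq) {
        return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
    }

    static double f(double count, int length) {
        double lengthRatio = (double) length / AVG_LENGTH;
        return (count * (K1 + 1)) / (count + K1 * (1 - B + B * lengthRatio));
    }

    public static void main(String[] args) {
        // a term matching 1,000 of 10,000,000 documents, occurring 3 times
        // in a 2,500-token document
        double idf = invFreq(10_000_000, 1_000); // ~9.2: rare terms weigh more
        double tf  = f(3, 2_500);                // ~1.8: repeated terms saturate quickly
        System.out.printf("BM25 contribution: %.2f%n", idf * tf); // ~16.2
    }
}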
diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
deleted file mode 100644
index d068c0f4..00000000
--- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package nu.marginalia.index.results;
-
-import com.google.inject.Inject;
-import gnu.trove.map.hash.TObjectLongHashMap;
-import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchTermsUtil;
-import nu.marginalia.index.results.model.QuerySearchTerms;
-import nu.marginalia.index.results.model.TermCoherenceGroupList;
-import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.results.model.ids.TermIdList;
-
-import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
-import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
-
-public class IndexMetadataService {
- private final StatefulIndex statefulIndex;
-
- @Inject
- public IndexMetadataService(StatefulIndex index) {
- this.statefulIndex = index;
- }
-
- public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
- TermIdList termIdsList)
- {
- var currentIndex = statefulIndex.get();
-
- Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta =
- new Long2ObjectArrayMap<>(termIdsList.size());
-
- for (long termId : termIdsList.array()) {
- var metadata = currentIndex.getMetadata(termId, combinedIdsAll);
-
- termdocToMeta.put(termId,
- new DocumentsWithMetadata(combinedIdsAll, metadata));
- }
-
- return new TermMetadataForCombinedDocumentIds(termdocToMeta);
- }
-
- public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
-
- LongArrayList termIdsList = new LongArrayList();
- LongArrayList termIdsPrio = new LongArrayList();
-
- TObjectLongHashMap