(index-reverse) Split index construction into separate packages for full and priority index

Viktor Lofgren 2024-07-06 15:44:47 +02:00
parent a4ecd5f4ce
commit 85c99ae808
24 changed files with 1006 additions and 139 deletions

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArrayTransformations;
@@ -9,7 +9,7 @@ import java.io.IOException;
import java.nio.channels.FileChannel;
/** Constructs the BTrees in a reverse index */
public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer;
private final FileChannel intermediateChannel;
@@ -18,10 +18,10 @@ public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
long start = 0;
long writeOffset = 0;
public ReverseIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
FileChannel intermediateChannel) {
public FullIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
FileChannel intermediateChannel) {
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize;
this.intermediateChannel = intermediateChannel;
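
The renamed transformer is an IO transformer driven by LongArray.transformEachIO during finalization. As a minimal usage sketch, mirroring the FullPreindex.finalizeIndex call site shown later in this commit (variable names are illustrative):

    // offsets: per-word counts already transformed into BTree offsets
    // finalDocs: mmapped output array receiving the finished BTrees
    try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
        offsets.transformEachIO(0, offsets.size(),
                new FullIndexBTreeTransformer(finalDocs, 2,
                        ReverseIndexParameters.docsBTreeContext,
                        intermediateDocChannel));
        intermediateDocChannel.force(false);
    }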

View File

@@ -1,6 +1,9 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
@@ -10,9 +13,9 @@ import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
public class ReverseIndexConstructor {
public class FullIndexConstructor {
private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
private static final Logger logger = LoggerFactory.getLogger(FullIndexConstructor.class);
public enum CreateReverseIndexSteps {
CONSTRUCT,
@@ -27,12 +30,12 @@ public class ReverseIndexConstructor {
private final DocIdRewriter docIdRewriter;
private final Path tmpDir;
public ReverseIndexConstructor(Path outputFileDocs,
Path outputFileWords,
Path outputFilePositions,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter,
Path tmpDir) {
public FullIndexConstructor(Path outputFileDocs,
Path outputFileWords,
Path outputFilePositions,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter,
Path tmpDir) {
this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords;
this.outputFilePositions = outputFilePositions;
@@ -77,20 +80,20 @@ public class ReverseIndexConstructor {
}
@SneakyThrows
private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
return ReversePreindex
private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
return FullPreindex
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}
@SneakyThrows
private ReversePreindexReference merge(ReversePreindexReference leftR, ReversePreindexReference rightR) {
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
var left = leftR.open();
var right = rightR.open();
try {
return ReversePreindex.merge(tmpDir, left, right).closeToReference();
return FullPreindex.merge(tmpDir, left, right).closeToReference();
}
finally {
left.delete();
@@ -101,7 +104,7 @@ public class ReverseIndexConstructor {
}
@SneakyThrows
private void finalizeIndex(ReversePreindexReference finalPR) {
private void finalizeIndex(FullPreindexReference finalPR) {
var finalP = finalPR.open();
finalP.finalizeIndex(outputFileDocs, outputFileWords);
finalP.delete();
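
After the rename, full-index call sites are wired like the updated IndexConstructorMain further down. A sketch, assuming the class keeps the createReverseIndex entry point shown in the Prio variant below (the reader source and process name are placeholders):

    var constructor = new FullIndexConstructor(
            outputFileDocs,
            outputFileWords,
            outputFilePositions,
            readerSource,              // a JournalReaderSource
            DocIdRewriter.identity(),  // or a rank-encoding rewriter
            tmpDir);
    constructor.createReverseIndex(processHeartbeat, "createReverseIndexFull", sourceBaseDir);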

View File

@@ -1,9 +1,13 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -25,13 +29,13 @@ import static nu.marginalia.array.algo.TwoArrayOperations.*;
* the union of their data. This operation requires no additional
* RAM.
*/
public class ReversePreindex {
final ReversePreindexWordSegments segments;
final ReversePreindexDocuments documents;
public class FullPreindex {
final FullPreindexWordSegments segments;
final FullPreindexDocuments documents;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
private static final Logger logger = LoggerFactory.getLogger(FullPreindex.class);
public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
public FullPreindex(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
this.segments = segments;
this.documents = documents;
}
@@ -39,27 +43,27 @@ public class ReversePreindex {
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static ReversePreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
public static FullPreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
{
Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
return new ReversePreindex(segments, docs);
var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
return new FullPreindex(segments, docs);
}
/** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened
* later.
*/
public ReversePreindexReference closeToReference() {
public FullPreindexReference closeToReference() {
try {
return new ReversePreindexReference(segments, documents);
return new FullPreindexReference(segments, documents);
}
finally {
segments.force();
@@ -85,7 +89,7 @@ public class ReversePreindex {
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(),
new ReverseIndexBTreeTransformer(finalDocs, 2,
new FullIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.docsBTreeContext,
intermediateDocChannel));
intermediateDocChannel.force(false);
@@ -126,11 +130,11 @@ public class ReversePreindex {
documents.delete();
}
public static ReversePreindex merge(Path destDir,
ReversePreindex left,
ReversePreindex right) throws IOException {
public static FullPreindex merge(Path destDir,
FullPreindex left,
FullPreindex right) throws IOException {
ReversePreindexWordSegments mergingSegment =
FullPreindexWordSegments mergingSegment =
createMergedSegmentWordFile(destDir, left.segments, right.segments);
var mergingIter = mergingSegment.constructionIterator(2);
@@ -198,18 +202,18 @@ public class ReversePreindex {
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
docsFile, 2 * mergingSegment.totalSize());
return new ReversePreindex(
return new FullPreindex(
mergingSegment,
new ReversePreindexDocuments(mergedDocuments, docsFile)
new FullPreindexDocuments(mergedDocuments, docsFile)
);
}
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
* This is an intermediate product in merging.
*/
static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
ReversePreindexWordSegments left,
ReversePreindexWordSegments right) throws IOException {
static FullPreindexWordSegments createMergedSegmentWordFile(Path destDir,
FullPreindexWordSegments left,
FullPreindexWordSegments right) throws IOException {
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
@@ -228,7 +232,7 @@ public class ReversePreindex {
LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
return new FullPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
}
/** It's possible we overestimated the necessary size of the documents file,
@@ -256,12 +260,12 @@ public class ReversePreindex {
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
* segment, and advance the construction iterator with the appropriate size.
*/
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
ReversePreindexWordSegments.SegmentIterator rightIter,
ReversePreindexDocuments left,
ReversePreindexDocuments right,
private static void mergeSegments(FullPreindexWordSegments.SegmentIterator leftIter,
FullPreindexWordSegments.SegmentIterator rightIter,
FullPreindexDocuments left,
FullPreindexDocuments right,
LongArray dest,
ReversePreindexWordSegments.SegmentConstructionIterator destIter)
FullPreindexWordSegments.SegmentConstructionIterator destIter)
{
long segSize = mergeArrays2(dest,
left.documents,
@@ -279,10 +283,10 @@ public class ReversePreindex {
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
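
Taken together, the renamed class keeps the construct/merge/finalize lifecycle of the old ReversePreindex. A minimal sketch of combining two preindexes into one finished index, using only the signatures shown above (readers and paths are placeholders):

    FullPreindex left  = FullPreindex.constructPreindex(readerA,
            positionsFileConstructor, DocIdRewriter.identity(), workDir);
    FullPreindex right = FullPreindex.constructPreindex(readerB,
            positionsFileConstructor, DocIdRewriter.identity(), workDir);

    FullPreindex merged = FullPreindex.merge(workDir, left, right);
    left.delete();
    right.delete();

    merged.finalizeIndex(outputFileDocs, outputFileWords); // writes the docs and words BTrees
    merged.delete();                                       // removes the temporary preindex files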

View File

@@ -1,8 +1,10 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger;
@@ -20,35 +22,35 @@ import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to
* the associated FullPreindexWordSegments data
*/
public class ReversePreindexDocuments {
public class FullPreindexDocuments {
public final LongArray documents;
private static PositionsFileConstructor positionsFileConstructor;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
private static final Logger logger = LoggerFactory.getLogger(FullPreindexDocuments.class);
public final Path file;
public ReversePreindexDocuments(LongArray documents, Path file) {
public FullPreindexDocuments(LongArray documents, Path file) {
this.documents = documents;
this.file = file;
}
public static ReversePreindexDocuments construct(
public static FullPreindexDocuments construct(
Path docsFile,
Path workDir,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
ReversePreindexWordSegments segments) throws IOException {
ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;
FullPreindexWordSegments segments) throws IOException {
FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments);
return new ReversePreindexDocuments(docsFileMap, docsFile);
return new FullPreindexDocuments(docsFileMap, docsFile);
}
public FileChannel createDocumentsFileChannel() throws IOException {
@@ -67,7 +69,7 @@ public class ReversePreindexDocuments {
private static void createUnsortedDocsFile(Path docsFile,
Path workDir,
IndexJournalReader reader,
ReversePreindexWordSegments segments,
FullPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
@@ -99,7 +101,7 @@
}
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) throws IOException {
var iter = segments.iterator(RECORD_SIZE_LONGS);

View File

@@ -1,33 +1,33 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import java.io.IOException;
import java.nio.file.Path;
/** This is a dehydrated version of a ReversePreIndex, that only
/** This is a dehydrated version of a FullPreIndex, that only
* keeps references to its location on disk but does not hold associated
* memory maps.
*/
public record ReversePreindexReference(
public record FullPreindexReference(
Path wordsFile,
Path countsFile,
Path documentsFile
)
{
public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
this(segments.wordsFile, segments.countsFile, documents.file);
}
public ReversePreindex open() throws IOException {
return new ReversePreindex(
new ReversePreindexWordSegments(
public FullPreindex open() throws IOException {
return new FullPreindex(
new FullPreindexWordSegments(
LongArrayFactory.mmapForModifyingShared(wordsFile),
LongArrayFactory.mmapForModifyingShared(countsFile),
wordsFile,
countsFile
),
new ReversePreindexDocuments(
new FullPreindexDocuments(
LongArrayFactory.mmapForModifyingShared(documentsFile),
documentsFile
)
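
The reference type exists so a preindex can be dehydrated between pipeline stages without holding its memory maps open. A short sketch of the round trip:

    // Close the memory mappings but keep the on-disk locations ...
    FullPreindexReference ref = preindex.closeToReference();
    // ... then rehydrate when the next merge or finalize step needs the data.
    FullPreindex reopened = ref.open();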

View File

@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -14,17 +14,17 @@ import java.nio.file.Path;
/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each termId.
*/
public class ReversePreindexWordSegments {
public class FullPreindexWordSegments {
public final LongArray wordIds;
public final LongArray counts;
final Path wordsFile;
final Path countsFile;
public ReversePreindexWordSegments(LongArray wordIds,
LongArray counts,
Path wordsFile,
Path countsFile)
public FullPreindexWordSegments(LongArray wordIds,
LongArray counts,
Path wordsFile,
Path countsFile)
{
assert wordIds.size() == counts.size();
@@ -51,9 +51,9 @@ public class ReversePreindexWordSegments {
return ret;
}
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
Path wordIdsFile,
Path countsFile)
public static FullPreindexWordSegments construct(IndexJournalReader reader,
Path wordIdsFile,
Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
@@ -79,7 +79,7 @@ public class ReversePreindexWordSegments {
counts.set(i, countsMap.get(words.get(i)));
}
return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile);
}
public SegmentIterator iterator(int recordSize) {

View File

@@ -0,0 +1,48 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.LongArrayTransformations;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.btree.model.BTreeContext;
import java.io.IOException;
import java.nio.channels.FileChannel;
/** Constructs the BTrees in a reverse index */
public class PrioIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
private final BTreeWriter writer;
private final FileChannel intermediateChannel;
private final int entrySize;
long start = 0;
long writeOffset = 0;
public PrioIndexBTreeTransformer(LongArray urlsFileMap,
int entrySize,
BTreeContext bTreeContext,
FileChannel intermediateChannel) {
this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
this.entrySize = entrySize;
this.intermediateChannel = intermediateChannel;
}
@Override
public long transform(long pos, long end) throws IOException {
final int size = (int) ((end - start) / entrySize);
if (size == 0) {
return -1;
}
final long offsetForBlock = writeOffset;
writeOffset += writer.write(writeOffset, size,
mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
);
start = end;
return offsetForBlock;
}
}

View File

@@ -0,0 +1,114 @@
package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.JournalReaderSource;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournalFileNames;
import nu.marginalia.process.control.ProcessHeartbeat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
public class PrioIndexConstructor {
private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class);
public enum CreateReverseIndexSteps {
CONSTRUCT,
FINALIZE,
FINISHED
}
private final Path outputFileDocs;
private final Path outputFileWords;
private final Path outputFilePositions;
private final JournalReaderSource readerSource;
private final DocIdRewriter docIdRewriter;
private final Path tmpDir;
public PrioIndexConstructor(Path outputFileDocs,
Path outputFileWords,
Path outputFilePositions,
JournalReaderSource readerSource,
DocIdRewriter docIdRewriter,
Path tmpDir) {
this.outputFileDocs = outputFileDocs;
this.outputFileWords = outputFileWords;
this.outputFilePositions = outputFilePositions;
this.readerSource = readerSource;
this.docIdRewriter = docIdRewriter;
this.tmpDir = tmpDir;
}
public void createReverseIndex(ProcessHeartbeat processHeartbeat,
String processName,
Path sourceBaseDir) throws IOException
{
var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
if (inputs.isEmpty()) {
logger.error("No journal files in base dir {}", sourceBaseDir);
return;
}
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
var posConstructor = new PositionsFileConstructor(outputFilePositions)
) {
heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
AtomicInteger progress = new AtomicInteger(0);
inputs
.parallelStream()
.map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
return construct(in, posConstructor);
})
.reduce(this::merge)
.ifPresent((index) -> {
heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
finalizeIndex(index);
heartbeat.progress(CreateReverseIndexSteps.FINISHED);
});
heartbeat.progress(CreateReverseIndexSteps.FINISHED);
}
}
@SneakyThrows
private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
return PrioPreindex
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}
@SneakyThrows
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
var left = leftR.open();
var right = rightR.open();
try {
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
}
finally {
left.delete();
right.delete();
}
}
@SneakyThrows
private void finalizeIndex(PrioPreindexReference finalPR) {
var finalP = finalPR.open();
finalP.finalizeIndex(outputFileDocs, outputFileWords);
finalP.delete();
}
}
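
The construction pipeline is a parallel map over journal files followed by a binary reduction in which each merge deletes its two inputs. Stripped of the heartbeat bookkeeping, createReverseIndex reduces to roughly:

    Optional<PrioPreindexReference> result = inputs.parallelStream()
            .map(in -> construct(in, posConstructor)) // journal file -> preindex reference
            .reduce(this::merge);                     // pairwise merge, deleting the inputs
    result.ifPresent(this::finalizeIndex);            // write the final docs/words files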

View File

@@ -0,0 +1,310 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeWriter;
import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import static nu.marginalia.array.algo.TwoArrayOperations.*;
/** Contains the data that would go into a reverse index,
* that is, a mapping from words to documents, minus the actual
* index structure that makes the data quick to access while
* searching.
* <p>
* Two preindexes can be merged into a third preindex containing
* the union of their data. This operation requires no additional
* RAM.
*/
public class PrioPreindex {
final PrioPreindexWordSegments segments;
final PrioPreindexDocuments documents;
private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
this.segments = segments;
this.documents = documents;
}
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static PrioPreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter,
Path workDir) throws IOException
{
Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
return new PrioPreindex(segments, docs);
}
/** Close the associated memory mapped areas and return
* a dehydrated version of this object that can be re-opened
* later.
*/
public PrioPreindexReference closeToReference() {
try {
return new PrioPreindexReference(segments, documents);
}
finally {
segments.force();
documents.force();
segments.close();
documents.close();
}
}
/** Transform the preindex into a reverse index */
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
var offsets = segments.counts;
Files.deleteIfExists(outputFileDocs);
Files.deleteIfExists(outputFileWords);
// Estimate the size of the docs index data
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
offsets.fold(0, 0, offsets.size(), sizeEstimator);
// Write the docs file
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(),
new PrioIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.docsBTreeContext,
intermediateDocChannel));
intermediateDocChannel.force(false);
}
LongArray wordIds = segments.wordIds;
if (offsets.size() != wordIds.size())
throw new IllegalStateException("Offsets and word-ids of different size");
if (offsets.size() > Integer.MAX_VALUE) {
throw new IllegalStateException("offsets.size() too big!");
}
// Estimate the size of the words index data
long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
// Construct the tree
LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
.write(0, (int) offsets.size(), mapRegion -> {
for (long i = 0; i < offsets.size(); i++) {
mapRegion.set(2*i, wordIds.get(i));
mapRegion.set(2*i + 1, offsets.get(i));
}
});
finalDocs.force();
finalDocs.close();
wordsArray.force();
wordsArray.close();
}
/** Delete all files associated with this pre-index */
public void delete() throws IOException {
segments.delete();
documents.delete();
}
public static PrioPreindex merge(Path destDir,
PrioPreindex left,
PrioPreindex right) throws IOException {
PrioPreindexWordSegments mergingSegment =
createMergedSegmentWordFile(destDir, left.segments, right.segments);
var mergingIter = mergingSegment.constructionIterator(2);
var leftIter = left.segments.iterator(2);
var rightIter = right.segments.iterator(2);
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size());
leftIter.next();
rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
{
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
final long currentWord = mergingIter.wordId;
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
}
}
if (leftIter.isPositionBeforeEnd())
throw new IllegalStateException("Left has more to go");
if (rightIter.isPositionBeforeEnd())
throw new IllegalStateException("Right has more to go");
if (mergingIter.canPutMore())
throw new IllegalStateException("Source iters ran dry before merging iter");
mergingSegment.force();
// We may have overestimated the size of the merged docs file in case there were
// duplicates in the data, so we need to shrink it to the actual size we wrote.
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
docsFile, 2 * mergingSegment.totalSize());
return new PrioPreindex(
mergingSegment,
new PrioPreindexDocuments(mergedDocuments, docsFile)
);
}
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
* This is an intermediate product in merging.
*/
static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir,
PrioPreindexWordSegments left,
PrioPreindexWordSegments right) throws IOException {
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
// We need total size to request a direct LongArray range. Seems slower, but is faster.
// ... see LongArray.directRangeIfPossible(long start, long end)
long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
0, left.wordIds.size(),
0, right.wordIds.size());
LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
0,
0, left.wordIds.size(),
0, right.wordIds.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
}
/** It's possible we overestimated the necessary size of the documents file,
* this will permit us to shrink it down to the smallest necessary size.
*/
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
mergedDocuments.force();
long beforeSize = mergedDocuments.size();
long afterSize = sizeLongs * 8;
if (beforeSize != afterSize) {
mergedDocuments.close();
try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
bc.truncate(sizeLongs * 8);
}
logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
}
return mergedDocuments;
}
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
* segment, and advance the construction iterator with the appropriate size.
*/
private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
PrioPreindexWordSegments.SegmentIterator rightIter,
PrioPreindexDocuments left,
PrioPreindexDocuments right,
LongArray dest,
PrioPreindexWordSegments.SegmentConstructionIterator destIter)
{
long segSize = mergeArrays2(dest,
left.documents,
right.documents,
destIter.startOffset,
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
long distinct = segSize / 2;
destIter.putNext(distinct);
leftIter.next();
rightIter.next();
}
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
LongArray dest,
FileChannel sourceChannel,
PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
dest.transferFrom(sourceChannel,
sourceIter.startOffset,
mergingIter.startOffset,
end);
boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next();
if (!putNext && iterNext)
throw new IllegalStateException("Source iterator ran out before dest iterator?!");
return iterNext;
}
}

View File

@@ -0,0 +1,141 @@
package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to
* the associated PrioPreindexWordSegments data
*/
public class PrioPreindexDocuments {
public final LongArray documents;
private static PositionsFileConstructor positionsFileConstructor;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
public final Path file;
public PrioPreindexDocuments(LongArray documents, Path file) {
this.documents = documents;
this.file = file;
}
public static PrioPreindexDocuments construct(
Path docsFile,
Path workDir,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
PrioPreindexWordSegments segments) throws IOException {
PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
sortDocsFile(docsFileMap, segments);
return new PrioPreindexDocuments(docsFileMap, docsFile);
}
public FileChannel createDocumentsFileChannel() throws IOException {
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
}
public LongArray slice(long start, long end) {
return documents.range(start, end);
}
public long size() {
return documents.size();
}
private static void createUnsortedDocsFile(Path docsFile,
Path workDir,
IndexJournalReader reader,
PrioPreindexWordSegments segments,
DocIdRewriter docIdRewriter) throws IOException {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer())
{
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
for (var termData : pointer) {
long termId = termData.termId();
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
// write position data to the positions file and get the offset
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, encodedPosOffset);
}
}
assembly.write(docsFile);
}
}
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) throws IOException {
var iter = segments.iterator(RECORD_SIZE_LONGS);
ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
while (iter.next()) {
long iterStart = iter.startOffset;
long iterEnd = iter.endOffset;
if (iter.size() < 1024) {
docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
}
else {
sortingWorkers.execute(() ->
docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
}
}
sortingWorkers.shutdown();
while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
sortingWorkers.close();
}
public void delete() throws IOException {
Files.delete(this.file);
documents.close();
}
public void close() {
documents.close();
}
public void force() {
documents.force();
}
}
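
Each record in the docs array is RECORD_SIZE_LONGS = 2 longs: the rank-encoded document id followed by the encoded offset into the positions file. A sketch of reading one word's segment back, assuming a SegmentIterator from the matching word segments supplies the bounds:

    for (long pos = iter.startOffset; pos < iter.endOffset; pos += 2) {
        long rankEncodedId    = docs.documents.get(pos);     // rank-encoded document id
        long encodedPosOffset = docs.documents.get(pos + 1); // offset into the positions file
        // resolve the term's positions via the positions file using encodedPosOffset
    }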

View File

@@ -0,0 +1,36 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.LongArrayFactory;
import java.io.IOException;
import java.nio.file.Path;
/** This is a dehydrated version of a PrioPreIndex, that only
* keeps references to its location on disk but does not hold associated
* memory maps.
*/
public record PrioPreindexReference(
Path wordsFile,
Path countsFile,
Path documentsFile
)
{
public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
this(segments.wordsFile, segments.countsFile, documents.file);
}
public PrioPreindex open() throws IOException {
return new PrioPreindex(
new PrioPreindexWordSegments(
LongArrayFactory.mmapForModifyingShared(wordsFile),
LongArrayFactory.mmapForModifyingShared(countsFile),
wordsFile,
countsFile
),
new PrioPreindexDocuments(
LongArrayFactory.mmapForModifyingShared(documentsFile),
documentsFile
)
);
}
}

View File

@@ -0,0 +1,205 @@
package nu.marginalia.index.construction.prio;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** A pair of file-backed arrays of sorted wordIds
* and the count of documents associated with each termId.
*/
public class PrioPreindexWordSegments {
public final LongArray wordIds;
public final LongArray counts;
final Path wordsFile;
final Path countsFile;
public PrioPreindexWordSegments(LongArray wordIds,
LongArray counts,
Path wordsFile,
Path countsFile)
{
assert wordIds.size() == counts.size();
this.wordIds = wordIds;
this.counts = counts;
this.wordsFile = wordsFile;
this.countsFile = countsFile;
}
/** Returns a long-long hash map where each key is a termId,
* and each value is the start offset of the data.
*/
public Long2LongOpenHashMap asMap(int recordSize) {
if (wordIds.size() > Integer.MAX_VALUE)
throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries");
Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
var iter = iterator(recordSize);
while (iter.next()) {
ret.put(iter.wordId, iter.startOffset);
}
return ret;
}
public static PrioPreindexWordSegments construct(IndexJournalReader reader,
Path wordIdsFile,
Path countsFile)
throws IOException
{
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
countsMap.defaultReturnValue(0);
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
// Create the words file by iterating over the map and inserting them into
// the words file in whatever bizarro hash table order they appear in
long i = 0;
LongIterator iter = countsMap.keySet().iterator();
while (iter.hasNext()) {
words.set(i++, iter.nextLong());
}
// Sort the words file
words.sort(0, counts.size());
// Populate the counts
for (i = 0; i < countsMap.size(); i++) {
counts.set(i, countsMap.get(words.get(i)));
}
return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile);
}
public SegmentIterator iterator(int recordSize) {
return new SegmentIterator(recordSize);
}
public SegmentConstructionIterator constructionIterator(int recordSize) {
return new SegmentConstructionIterator(recordSize);
}
public long totalSize() {
return counts.fold(0, 0, counts.size(), Long::sum);
}
public void delete() throws IOException {
Files.delete(countsFile);
Files.delete(wordsFile);
counts.close();
wordIds.close();
}
public void force() {
counts.force();
wordIds.force();
}
public void close() {
wordIds.close();
counts.close();
}
public class SegmentIterator {
private final int recordSize;
private final long fileSize;
long wordId;
long startOffset = 0;
long endOffset = 0;
private SegmentIterator(int recordSize) {
this.recordSize = recordSize;
this.fileSize = wordIds.size();
}
private long i = -1;
public long idx() {
return i;
}
public boolean next() {
if (++i >= fileSize) {
wordId = Long.MIN_VALUE;
return false;
}
wordId = wordIds.get(i);
startOffset = endOffset;
endOffset = startOffset + recordSize * counts.get(i);
return true;
}
public boolean hasMorePositions() {
return i + 1 < wordIds.size();
}
public boolean isPositionBeforeEnd() {
return i < wordIds.size();
}
public long size() {
return endOffset - startOffset;
}
}
class SegmentConstructionIterator {
private final int recordSize;
private final long fileSize;
long wordId;
long startOffset = 0;
long endOffset = 0;
private SegmentConstructionIterator(int recordSize) {
this.recordSize = recordSize;
this.fileSize = wordIds.size();
if (fileSize == 0) {
throw new IllegalArgumentException("Cannot construct zero-length word segment file");
}
this.wordId = wordIds.get(0);
}
private long i = 0;
public long idx() {
return i;
}
public boolean putNext(long size) {
if (i >= fileSize)
return false;
endOffset = startOffset + recordSize * size;
counts.set(i, size);
startOffset = endOffset;
endOffset = -1;
i++;
if (i == fileSize) {
// We've reached the end of the iteration and there is no
// "next" termId to fetch
wordId = Long.MIN_VALUE;
return false;
}
else {
wordId = wordIds.get(i);
return true;
}
}
public boolean canPutMore() {
return i < wordIds.size();
}
}
}
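
For reference, SegmentIterator walks (wordId, document range) pairs in sorted wordId order; a minimal sketch of consuming it:

    var iter = segments.iterator(2); // record size 2, matching the docs file layout
    while (iter.next()) {
        long wordId = iter.wordId;
        // this word's records occupy [iter.startOffset, iter.endOffset)
        // in the corresponding PrioPreindexDocuments array
    }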

View File

@@ -4,9 +4,9 @@ import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.construction.full.FullPreindex;
import nu.marginalia.index.construction.full.TestJournalFactory;
import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.positions.PositionsFileReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@@ -19,7 +19,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.TestJournalFactory.wm;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.*;
class ReverseIndexReaderTest {
@@ -99,7 +99,7 @@ class ReverseIndexReaderTest {
Path wordsFile = tempDir.resolve("words.dat");
try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
var preindex = ReversePreindex.constructPreindex(reader,
var preindex = FullPreindex.constructPreindex(reader,
positionsFileConstructor,
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(docsFile, wordsFile);

View File

@@ -1,5 +1,7 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -11,10 +13,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ReversePreindexDocsTest {
class FullPreindexDocsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
@@ -57,8 +59,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
@@ -86,8 +88,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 4, 4)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
new PositionsFileConstructor(positionsFile),
segments);
@@ -115,8 +117,8 @@ class ReversePreindexDocsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
new PositionsFileConstructor(positionsFile),
segments);

View File

@@ -1,8 +1,10 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.model.BTreeHeader;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -12,11 +14,11 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import static nu.marginalia.index.construction.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
class ReversePreindexFinalizeTest {
class FullPreindexFinalizeTest {
TestJournalFactory journalFactory;
Path positionsFile;
Path countsFile;
@@ -52,7 +54,7 @@ class ReversePreindexFinalizeTest {
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
var preindex = ReversePreindex.constructPreindex(reader,
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);
@@ -90,7 +92,7 @@ class ReversePreindexFinalizeTest {
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
var preindex = ReversePreindex.constructPreindex(reader,
var preindex = FullPreindex.constructPreindex(reader,
new PositionsFileConstructor(positionsFile),
DocIdRewriter.identity(), tempDir);

View File

@@ -1,6 +1,8 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -10,10 +12,10 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import static nu.marginalia.index.construction.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ReversePreindexMergeTest {
class FullPreindexMergeTest {
TestJournalFactory journalFactory;
Path countsFile;
Path wordsIdFile;
@@ -46,19 +48,19 @@ class ReversePreindexMergeTest {
Files.delete(tempDir);
}
public ReversePreindex runMergeScenario(
public FullPreindex runMergeScenario(
List<EntryDataWithWordMeta> leftData,
List<EntryDataWithWordMeta> rightData
) throws IOException {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
var left = ReversePreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = ReversePreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return ReversePreindex.merge(tempDir, left, right);
var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return FullPreindex.merge(tempDir, left, right);
}
private List<TestSegmentData> getData(ReversePreindex merged) {
private List<TestSegmentData> getData(FullPreindex merged) {
var iter = merged.segments.iterator(2);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.array.LongArray;
import org.junit.jupiter.api.AfterEach;
@@ -11,10 +11,10 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.*;
class ReversePreindexWordSegmentsTest {
class FullPreindexWordSegmentsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
@@ -51,7 +51,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 1L<<33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@@ -72,7 +72,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 5, 5)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@@ -94,7 +94,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@@ -120,7 +120,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@@ -148,7 +148,7 @@ class ReversePreindexWordSegmentsTest {
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
countsArray.set(0, 2, 1, 3, 5);
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var ritr = segments.iterator(1);
assertTrue(ritr.hasMorePositions());
@@ -196,7 +196,7 @@ class ReversePreindexWordSegmentsTest {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);
var citr = segments.constructionIterator(1);
assertEquals(-1, citr.wordId);

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;

View File

@@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;
import java.util.Arrays;

View File

@@ -3,13 +3,11 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.IndexLocations;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@@ -33,7 +31,6 @@ import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
@@ -247,7 +244,7 @@ public class CombinedIndexReaderTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor =
new ReverseIndexConstructor(
new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@@ -267,7 +264,7 @@ public class CombinedIndexReaderTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,

View File

@@ -14,7 +14,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -291,7 +291,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@@ -313,7 +313,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,

View File

@@ -7,13 +7,13 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -493,7 +493,7 @@ public class IndexQueryServiceIntegrationTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor =
new ReverseIndexConstructor(
new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@@ -513,7 +513,7 @@ public class IndexQueryServiceIntegrationTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,

View File

@@ -6,10 +6,11 @@ import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.reader.IndexJournalReader;
@@ -117,7 +118,7 @@ public class IndexConstructorMain extends ProcessMainClass {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@@ -142,7 +143,7 @@ public class IndexConstructorMain extends ProcessMainClass {
// important to the document. This filter will act on the encoded {@see WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
var constructor = new ReverseIndexConstructor(
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,

View File

@@ -17,7 +17,7 @@ import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@@ -244,7 +244,7 @@ public class IntegrationTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@@ -269,7 +269,7 @@ public class IntegrationTest {
// important to the document. This filter will act on the encoded {@see WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,