Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

(index-reverse) Split index construction into separate packages for full and priority index

parent a4ecd5f4ce
commit 85c99ae808
ReverseIndexBTreeTransformer.java → full/FullIndexBTreeTransformer.java

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.algo.LongArrayTransformations;
@@ -9,7 +9,7 @@ import java.io.IOException;
 import java.nio.channels.FileChannel;
 
 /** Constructs the BTrees in a reverse index */
-public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     private final BTreeWriter writer;
     private final FileChannel intermediateChannel;
 
@@ -18,10 +18,10 @@ public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     long start = 0;
     long writeOffset = 0;
 
-    public ReverseIndexBTreeTransformer(LongArray urlsFileMap,
-                                        int entrySize,
-                                        BTreeContext bTreeContext,
-                                        FileChannel intermediateChannel) {
+    public FullIndexBTreeTransformer(LongArray urlsFileMap,
+                                     int entrySize,
+                                     BTreeContext bTreeContext,
+                                     FileChannel intermediateChannel) {
         this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
         this.entrySize = entrySize;
         this.intermediateChannel = intermediateChannel;
ReverseIndexConstructor.java → full/FullIndexConstructor.java

@@ -1,6 +1,9 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
@@ -10,9 +13,9 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.concurrent.atomic.AtomicInteger;
 
-public class ReverseIndexConstructor {
+public class FullIndexConstructor {
 
-    private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullIndexConstructor.class);
 
     public enum CreateReverseIndexSteps {
         CONSTRUCT,
@@ -27,12 +30,12 @@ public class ReverseIndexConstructor {
     private final DocIdRewriter docIdRewriter;
     private final Path tmpDir;
 
-    public ReverseIndexConstructor(Path outputFileDocs,
-                                   Path outputFileWords,
-                                   Path outputFilePositions,
-                                   JournalReaderSource readerSource,
-                                   DocIdRewriter docIdRewriter,
-                                   Path tmpDir) {
+    public FullIndexConstructor(Path outputFileDocs,
+                                Path outputFileWords,
+                                Path outputFilePositions,
+                                JournalReaderSource readerSource,
+                                DocIdRewriter docIdRewriter,
+                                Path tmpDir) {
         this.outputFileDocs = outputFileDocs;
         this.outputFileWords = outputFileWords;
         this.outputFilePositions = outputFilePositions;
@@ -77,20 +80,20 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
-        return ReversePreindex
+    private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return FullPreindex
                 .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
                 .closeToReference();
     }
 
     @SneakyThrows
-    private ReversePreindexReference merge(ReversePreindexReference leftR, ReversePreindexReference rightR) {
+    private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
 
         var left = leftR.open();
         var right = rightR.open();
 
         try {
-            return ReversePreindex.merge(tmpDir, left, right).closeToReference();
+            return FullPreindex.merge(tmpDir, left, right).closeToReference();
         }
         finally {
             left.delete();
@@ -101,7 +104,7 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private void finalizeIndex(ReversePreindexReference finalPR) {
+    private void finalizeIndex(FullPreindexReference finalPR) {
         var finalP = finalPR.open();
         finalP.finalizeIndex(outputFileDocs, outputFileWords);
         finalP.delete();
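The renamed FullIndexConstructor keeps the shape of the old ReverseIndexConstructor. A minimal wiring sketch follows; the output file names, the heartbeat instance, and the assumption that the full variant exposes the same createReverseIndex(heartbeat, processName, sourceBaseDir) entry point as the new PrioIndexConstructor below are mine, not from the commit:

    // Hypothetical wiring; paths and the createReverseIndex entry point are assumptions.
    void buildFullIndex(ProcessHeartbeat heartbeat,
                        JournalReaderSource readerSource,
                        Path outputDir, Path tmpDir, Path journalDir) throws IOException {
        var constructor = new FullIndexConstructor(
                outputDir.resolve("docs.dat"),      // assumed output file names
                outputDir.resolve("words.dat"),
                outputDir.resolve("positions.dat"),
                readerSource,
                DocIdRewriter.identity(),           // identity doc-id mapping, as the tests use
                tmpDir);
        constructor.createReverseIndex(heartbeat, "createFullReverseIndex", journalDir);
    }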
ReversePreindex.java → full/FullPreindex.java

@@ -1,9 +1,13 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,13 +29,13 @@ import static nu.marginalia.array.algo.TwoArrayOperations.*;
  * the union of their data. This operation requires no additional
  * RAM.
  */
-public class ReversePreindex {
-    final ReversePreindexWordSegments segments;
-    final ReversePreindexDocuments documents;
+public class FullPreindex {
+    final FullPreindexWordSegments segments;
+    final FullPreindexDocuments documents;
 
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindex.class);
 
-    public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindex(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this.segments = segments;
         this.documents = documents;
     }
@@ -39,27 +43,27 @@ public class ReversePreindex {
     /** Constructs a new preindex with the data associated with reader. The backing files
      * will have randomly assigned names.
      */
-    public static ReversePreindex constructPreindex(IndexJournalReader reader,
-                                                    PositionsFileConstructor positionsFileConstructor,
-                                                    DocIdRewriter docIdRewriter,
-                                                    Path workDir) throws IOException
+    public static FullPreindex constructPreindex(IndexJournalReader reader,
+                                                 PositionsFileConstructor positionsFileConstructor,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
     {
         Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
-        var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
-        return new ReversePreindex(segments, docs);
+        var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        return new FullPreindex(segments, docs);
     }
 
     /** Close the associated memory mapped areas and return
      * a dehydrated version of this object that can be re-opened
      * later.
      */
-    public ReversePreindexReference closeToReference() {
+    public FullPreindexReference closeToReference() {
         try {
-            return new ReversePreindexReference(segments, documents);
+            return new FullPreindexReference(segments, documents);
         }
         finally {
             segments.force();
@@ -85,7 +89,7 @@ public class ReversePreindex {
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
         try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
             offsets.transformEachIO(0, offsets.size(),
-                    new ReverseIndexBTreeTransformer(finalDocs, 2,
+                    new FullIndexBTreeTransformer(finalDocs, 2,
                             ReverseIndexParameters.docsBTreeContext,
                             intermediateDocChannel));
             intermediateDocChannel.force(false);
@@ -126,11 +130,11 @@ public class ReversePreindex {
         documents.delete();
     }
 
-    public static ReversePreindex merge(Path destDir,
-                                        ReversePreindex left,
-                                        ReversePreindex right) throws IOException {
+    public static FullPreindex merge(Path destDir,
+                                     FullPreindex left,
+                                     FullPreindex right) throws IOException {
 
-        ReversePreindexWordSegments mergingSegment =
+        FullPreindexWordSegments mergingSegment =
                 createMergedSegmentWordFile(destDir, left.segments, right.segments);
 
         var mergingIter = mergingSegment.constructionIterator(2);
@@ -198,18 +202,18 @@ public class ReversePreindex {
         mergedDocuments = shrinkMergedDocuments(mergedDocuments,
                 docsFile, 2 * mergingSegment.totalSize());
 
-        return new ReversePreindex(
+        return new FullPreindex(
                 mergingSegment,
-                new ReversePreindexDocuments(mergedDocuments, docsFile)
+                new FullPreindexDocuments(mergedDocuments, docsFile)
         );
     }
 
     /** Create a segment word file with each word from both inputs, with zero counts for all the data.
      * This is an intermediate product in merging.
      */
-    static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
-                                                                   ReversePreindexWordSegments left,
-                                                                   ReversePreindexWordSegments right) throws IOException {
+    static FullPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+                                                                FullPreindexWordSegments left,
+                                                                FullPreindexWordSegments right) throws IOException {
         Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
 
@@ -228,7 +232,7 @@ public class ReversePreindex {
 
         LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
 
-        return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+        return new FullPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
     }
 
     /** It's possible we overestimated the necessary size of the documents file,
@@ -256,12 +260,12 @@ public class ReversePreindex {
     /** Merge contents of the segments indicated by leftIter and rightIter into the destination
      * segment, and advance the construction iterator with the appropriate size.
      */
-    private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
-                                      ReversePreindexWordSegments.SegmentIterator rightIter,
-                                      ReversePreindexDocuments left,
-                                      ReversePreindexDocuments right,
+    private static void mergeSegments(FullPreindexWordSegments.SegmentIterator leftIter,
+                                      FullPreindexWordSegments.SegmentIterator rightIter,
+                                      FullPreindexDocuments left,
+                                      FullPreindexDocuments right,
                                       LongArray dest,
-                                      ReversePreindexWordSegments.SegmentConstructionIterator destIter)
+                                      FullPreindexWordSegments.SegmentConstructionIterator destIter)
     {
         long segSize = mergeArrays2(dest,
                 left.documents,
@@ -279,10 +283,10 @@ public class ReversePreindex {
     /** Copy the data from the source segment at the position and length indicated by sourceIter,
      * into the destination segment, and advance the construction iterator.
      */
-    private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+    private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
+                                       LongArray dest,
+                                       FileChannel sourceChannel,
+                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
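Taken together, the renamed methods give FullPreindex the same three-phase lifecycle the rest of this diff exercises: construct a preindex per journal file, merge pairwise, finalize once. A sketch using only the methods visible above (reader and positions-file setup abbreviated):

    FullPreindex left  = FullPreindex.constructPreindex(reader1, posConstructor, DocIdRewriter.identity(), tmpDir);
    FullPreindex right = FullPreindex.constructPreindex(reader2, posConstructor, DocIdRewriter.identity(), tmpDir);

    FullPreindex merged = FullPreindex.merge(tmpDir, left, right);
    left.delete();
    right.delete();

    merged.finalizeIndex(outputFileDocs, outputFileWords); // writes the BTree-backed reverse index
    merged.delete();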
ReversePreindexDocuments.java → full/FullPreindexDocuments.java

@@ -1,8 +1,10 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.rwf.RandomFileAssembler;
 import org.slf4j.Logger;
@@ -20,35 +22,35 @@ import java.util.concurrent.TimeUnit;
 /** A LongArray with document data, segmented according to
  * the associated ReversePreindexWordSegments data
  */
-public class ReversePreindexDocuments {
+public class FullPreindexDocuments {
     public final LongArray documents;
 
     private static PositionsFileConstructor positionsFileConstructor;
     private static final int RECORD_SIZE_LONGS = 2;
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindexDocuments.class);
 
     public final Path file;
 
-    public ReversePreindexDocuments(LongArray documents, Path file) {
+    public FullPreindexDocuments(LongArray documents, Path file) {
         this.documents = documents;
         this.file = file;
     }
 
-    public static ReversePreindexDocuments construct(
+    public static FullPreindexDocuments construct(
             Path docsFile,
             Path workDir,
             IndexJournalReader reader,
             DocIdRewriter docIdRewriter,
             PositionsFileConstructor positionsFileConstructor,
-            ReversePreindexWordSegments segments) throws IOException {
-        ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+            FullPreindexWordSegments segments) throws IOException {
+        FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
 
         createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
 
         LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
         sortDocsFile(docsFileMap, segments);
 
-        return new ReversePreindexDocuments(docsFileMap, docsFile);
+        return new FullPreindexDocuments(docsFileMap, docsFile);
     }
 
     public FileChannel createDocumentsFileChannel() throws IOException {
@@ -67,7 +69,7 @@ public class ReversePreindexDocuments {
     private static void createUnsortedDocsFile(Path docsFile,
                                                Path workDir,
                                                IndexJournalReader reader,
-                                               ReversePreindexWordSegments segments,
+                                               FullPreindexWordSegments segments,
                                                DocIdRewriter docIdRewriter) throws IOException {
 
         long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
@@ -99,7 +101,7 @@ public class ReversePreindexDocuments {
     }
 
     @SneakyThrows
-    private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
+    private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) throws IOException {
 
         var iter = segments.iterator(RECORD_SIZE_LONGS);
 
ReversePreindexReference.java → full/FullPreindexReference.java

@@ -1,33 +1,33 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;
 
 import java.io.IOException;
 import java.nio.file.Path;
 
-/** This is a dehydrated version of a ReversePreIndex, that only
+/** This is a dehydrated version of a FullPreIndex, that only
  * keeps references to its location on disk but does not hold associated
  * memory maps.
  */
-public record ReversePreindexReference(
+public record FullPreindexReference(
         Path wordsFile,
         Path countsFile,
         Path documentsFile
 )
 {
-    public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this(segments.wordsFile, segments.countsFile, documents.file);
     }
 
-    public ReversePreindex open() throws IOException {
-        return new ReversePreindex(
-                new ReversePreindexWordSegments(
+    public FullPreindex open() throws IOException {
+        return new FullPreindex(
+                new FullPreindexWordSegments(
                         LongArrayFactory.mmapForModifyingShared(wordsFile),
                         LongArrayFactory.mmapForModifyingShared(countsFile),
                         wordsFile,
                         countsFile
                 ),
-                new ReversePreindexDocuments(
+                new FullPreindexDocuments(
                         LongArrayFactory.mmapForModifyingShared(documentsFile),
                         documentsFile
                 )
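The reference record exists so a preindex can be dehydrated between pipeline stages: closeToReference() forces and unmaps the backing arrays, and open() re-maps them from the recorded paths. Round-trip sketch:

    FullPreindexReference ref = preindex.closeToReference(); // mmaps are forced and closed here
    // ... hand the lightweight record to another stage or thread ...
    FullPreindex reopened = ref.open();                      // re-maps wordsFile, countsFile, documentsFile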
ReversePreindexWordSegments.java → full/FullPreindexWordSegments.java

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -14,17 +14,17 @@ import java.nio.file.Path;
 /** A pair of file-backed arrays of sorted wordIds
  * and the count of documents associated with each termId.
  */
-public class ReversePreindexWordSegments {
+public class FullPreindexWordSegments {
     public final LongArray wordIds;
     public final LongArray counts;
 
     final Path wordsFile;
     final Path countsFile;
 
-    public ReversePreindexWordSegments(LongArray wordIds,
-                                       LongArray counts,
-                                       Path wordsFile,
-                                       Path countsFile)
+    public FullPreindexWordSegments(LongArray wordIds,
+                                    LongArray counts,
+                                    Path wordsFile,
+                                    Path countsFile)
     {
         assert wordIds.size() == counts.size();
 
@@ -51,9 +51,9 @@ public class ReversePreindexWordSegments {
         return ret;
     }
 
-    public static ReversePreindexWordSegments construct(IndexJournalReader reader,
-                                                        Path wordIdsFile,
-                                                        Path countsFile)
+    public static FullPreindexWordSegments construct(IndexJournalReader reader,
+                                                     Path wordIdsFile,
+                                                     Path countsFile)
     throws IOException
     {
         Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
@@ -79,7 +79,7 @@ public class ReversePreindexWordSegments {
             counts.set(i, countsMap.get(words.get(i)));
         }
 
-        return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
+        return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile);
     }
 
     public SegmentIterator iterator(int recordSize) {
prio/PrioIndexBTreeTransformer.java (new file)

@@ -0,0 +1,48 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.btree.model.BTreeContext;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+
+/** Constructs the BTrees in a reverse index */
+public class PrioIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+    private final BTreeWriter writer;
+    private final FileChannel intermediateChannel;
+
+    private final int entrySize;
+
+    long start = 0;
+    long writeOffset = 0;
+
+    public PrioIndexBTreeTransformer(LongArray urlsFileMap,
+                                     int entrySize,
+                                     BTreeContext bTreeContext,
+                                     FileChannel intermediateChannel) {
+        this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
+        this.entrySize = entrySize;
+        this.intermediateChannel = intermediateChannel;
+    }
+
+    @Override
+    public long transform(long pos, long end) throws IOException {
+
+        final int size = (int) ((end - start) / entrySize);
+
+        if (size == 0) {
+            return -1;
+        }
+
+        final long offsetForBlock = writeOffset;
+
+        writeOffset += writer.write(writeOffset, size,
+                mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+        );
+
+        start = end;
+        return offsetForBlock;
+    }
+}
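The transformer is driven by LongArrayTransformations.transformEachIO over the offsets array. A rough model of that driving loop, inferred from the transformer's fields rather than taken from the LongArray implementation:

    // Inferred model; the real transformEachIO may differ in details.
    for (long i = 0; i < offsets.size(); i++) {
        long end = offsets.get(i);                     // cumulative record offset for word i
        offsets.set(i, transformer.transform(i, end)); // count is replaced by the BTree block offset
    }

Each call writes a BTree for the records in [start, end) of the intermediate channel and returns the offset where that block landed, so afterwards the offsets array doubles as the word-to-block lookup fed into the words BTree.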
prio/PrioIndexConstructor.java (new file)

@@ -0,0 +1,114 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.IndexJournalFileNames;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class PrioIndexConstructor {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class);
+
+    public enum CreateReverseIndexSteps {
+        CONSTRUCT,
+        FINALIZE,
+        FINISHED
+    }
+
+    private final Path outputFileDocs;
+    private final Path outputFileWords;
+    private final Path outputFilePositions;
+    private final JournalReaderSource readerSource;
+    private final DocIdRewriter docIdRewriter;
+    private final Path tmpDir;
+
+    public PrioIndexConstructor(Path outputFileDocs,
+                                Path outputFileWords,
+                                Path outputFilePositions,
+                                JournalReaderSource readerSource,
+                                DocIdRewriter docIdRewriter,
+                                Path tmpDir) {
+        this.outputFileDocs = outputFileDocs;
+        this.outputFileWords = outputFileWords;
+        this.outputFilePositions = outputFilePositions;
+        this.readerSource = readerSource;
+        this.docIdRewriter = docIdRewriter;
+        this.tmpDir = tmpDir;
+    }
+
+    public void createReverseIndex(ProcessHeartbeat processHeartbeat,
+                                   String processName,
+                                   Path sourceBaseDir) throws IOException
+    {
+        var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
+        if (inputs.isEmpty()) {
+            logger.error("No journal files in base dir {}", sourceBaseDir);
+            return;
+        }
+
+        try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
+             var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
+             var posConstructor = new PositionsFileConstructor(outputFilePositions)
+        ) {
+            heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
+
+            AtomicInteger progress = new AtomicInteger(0);
+
+            inputs
+                .parallelStream()
+                .map(in -> {
+                    preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
+                    return construct(in, posConstructor);
+                })
+                .reduce(this::merge)
+                .ifPresent((index) -> {
+                    heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
+                    finalizeIndex(index);
+                    heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+                });
+
+            heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+        }
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return PrioPreindex
+                .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
+                .closeToReference();
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
+
+        var left = leftR.open();
+        var right = rightR.open();
+
+        try {
+            return PrioPreindex.merge(tmpDir, left, right).closeToReference();
+        }
+        finally {
+            left.delete();
+            right.delete();
+        }
+    }
+
+    @SneakyThrows
+    private void finalizeIndex(PrioPreindexReference finalPR) {
+        var finalP = finalPR.open();
+        finalP.finalizeIndex(outputFileDocs, outputFileWords);
+        finalP.delete();
+    }
+}
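The construction pipeline is a map/reduce over journal files: each input becomes a dehydrated preindex reference, and reduce(this::merge) folds them into one. Because the stream is parallel, the reduction may combine intermediate results in a tree shape rather than left-to-right, so merge needs to be associative; it is, since each merge produces the union of its inputs. The sequential equivalent, as a sketch:

    PrioPreindexReference acc = construct(inputs.get(0), posConstructor);
    for (int i = 1; i < inputs.size(); i++) {
        acc = merge(acc, construct(inputs.get(i), posConstructor));
    }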
prio/PrioPreindex.java (new file)

@@ -0,0 +1,310 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static nu.marginalia.array.algo.TwoArrayOperations.*;
+
+/** Contains the data that would go into a reverse index,
+ * that is, a mapping from words to documents, minus the actual
+ * index structure that makes the data quick to access while
+ * searching.
+ * <p>
+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
+public class PrioPreindex {
+    final PrioPreindexWordSegments segments;
+    final PrioPreindexDocuments documents;
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
+
+    public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this.segments = segments;
+        this.documents = documents;
+    }
+
+    /** Constructs a new preindex with the data associated with reader. The backing files
+     * will have randomly assigned names.
+     */
+    public static PrioPreindex constructPreindex(IndexJournalReader reader,
+                                                 PositionsFileConstructor positionsFileConstructor,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
+    {
+        Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
+        Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
+        Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
+
+        var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+        var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        return new PrioPreindex(segments, docs);
+    }
+
+    /** Close the associated memory mapped areas and return
+     * a dehydrated version of this object that can be re-opened
+     * later.
+     */
+    public PrioPreindexReference closeToReference() {
+        try {
+            return new PrioPreindexReference(segments, documents);
+        }
+        finally {
+            segments.force();
+            documents.force();
+            segments.close();
+            documents.close();
+        }
+    }
+
+    /** Transform the preindex into a reverse index */
+    public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+        var offsets = segments.counts;
+
+        Files.deleteIfExists(outputFileDocs);
+        Files.deleteIfExists(outputFileWords);
+
+        // Estimate the size of the docs index data
+        offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
+        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+        offsets.fold(0, 0, offsets.size(), sizeEstimator);
+
+        // Write the docs file
+        LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
+        try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
+            offsets.transformEachIO(0, offsets.size(),
+                    new PrioIndexBTreeTransformer(finalDocs, 2,
+                            ReverseIndexParameters.docsBTreeContext,
+                            intermediateDocChannel));
+            intermediateDocChannel.force(false);
+        }
+
+        LongArray wordIds = segments.wordIds;
+
+        if (offsets.size() != wordIds.size())
+            throw new IllegalStateException("Offsets and word-ids of different size");
+        if (offsets.size() > Integer.MAX_VALUE) {
+            throw new IllegalStateException("offsets.size() too big!");
+        }
+
+        // Estimate the size of the words index data
+        long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+        // Construct the tree
+        LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
+
+        new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+                .write(0, (int) offsets.size(), mapRegion -> {
+                    for (long i = 0; i < offsets.size(); i++) {
+                        mapRegion.set(2*i, wordIds.get(i));
+                        mapRegion.set(2*i + 1, offsets.get(i));
+                    }
+                });
+
+        finalDocs.force();
+        finalDocs.close();
+        wordsArray.force();
+        wordsArray.close();
+    }
+
+    /** Delete all files associated with this pre-index */
+    public void delete() throws IOException {
+        segments.delete();
+        documents.delete();
+    }
+
+    public static PrioPreindex merge(Path destDir,
+                                     PrioPreindex left,
+                                     PrioPreindex right) throws IOException {
+
+        PrioPreindexWordSegments mergingSegment =
+                createMergedSegmentWordFile(destDir, left.segments, right.segments);
+
+        var mergingIter = mergingSegment.constructionIterator(2);
+        var leftIter = left.segments.iterator(2);
+        var rightIter = right.segments.iterator(2);
+
+        Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+        LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size());
+
+        leftIter.next();
+        rightIter.next();
+
+        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
+             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+        {
+
+            while (mergingIter.canPutMore()
+                    && leftIter.isPositionBeforeEnd()
+                    && rightIter.isPositionBeforeEnd())
+            {
+                final long currentWord = mergingIter.wordId;
+
+                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+                {
+                    // both inputs have documents for the current word
+                    mergeSegments(leftIter, rightIter,
+                            left.documents, right.documents,
+                            mergedDocuments, mergingIter);
+                }
+                else if (leftIter.wordId == currentWord) {
+                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
+                        break;
+                }
+                else if (rightIter.wordId == currentWord) {
+                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
+                        break;
+                }
+                else assert false : "This should never happen"; // the helvetica scenario
+            }
+
+            if (leftIter.isPositionBeforeEnd()) {
+                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+            }
+
+            if (rightIter.isPositionBeforeEnd()) {
+                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+            }
+
+        }
+
+        if (leftIter.isPositionBeforeEnd())
+            throw new IllegalStateException("Left has more to go");
+        if (rightIter.isPositionBeforeEnd())
+            throw new IllegalStateException("Right has more to go");
+        if (mergingIter.canPutMore())
+            throw new IllegalStateException("Source iters ran dry before merging iter");
+
+        mergingSegment.force();
+
+        // We may have overestimated the size of the merged docs size in the case there were
+        // duplicates in the data, so we need to shrink it to the actual size we wrote.
+
+        mergedDocuments = shrinkMergedDocuments(mergedDocuments,
+                docsFile, 2 * mergingSegment.totalSize());
+
+        return new PrioPreindex(
+                mergingSegment,
+                new PrioPreindexDocuments(mergedDocuments, docsFile)
+        );
+    }
+
+    /** Create a segment word file with each word from both inputs, with zero counts for all the data.
+     * This is an intermediate product in merging.
+     */
+    static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+                                                                PrioPreindexWordSegments left,
+                                                                PrioPreindexWordSegments right) throws IOException {
+        Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+        Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+        // We need total size to request a direct LongArray range. Seems slower, but is faster.
+        // ... see LongArray.directRangeIfPossible(long start, long end)
+        long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
+
+        mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+                0,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
+
+        return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+    }
+
+    /** It's possible we overestimated the necessary size of the documents file,
+     * this will permit us to shrink it down to the smallest necessary size.
+     */
+    private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
+
+        mergedDocuments.force();
+
+        long beforeSize = mergedDocuments.size();
+        long afterSize = sizeLongs * 8;
+        if (beforeSize != afterSize) {
+            mergedDocuments.close();
+            try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
+                bc.truncate(sizeLongs * 8);
+            }
+
+            logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
+            mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
+        }
+
+        return mergedDocuments;
+    }
+
+    /** Merge contents of the segments indicated by leftIter and rightIter into the destination
+     * segment, and advance the construction iterator with the appropriate size.
+     */
+    private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
+                                      PrioPreindexWordSegments.SegmentIterator rightIter,
+                                      PrioPreindexDocuments left,
+                                      PrioPreindexDocuments right,
+                                      LongArray dest,
+                                      PrioPreindexWordSegments.SegmentConstructionIterator destIter)
+    {
+        long segSize = mergeArrays2(dest,
+                left.documents,
+                right.documents,
+                destIter.startOffset,
+                leftIter.startOffset, leftIter.endOffset,
+                rightIter.startOffset, rightIter.endOffset);
+
+        long distinct = segSize / 2;
+        destIter.putNext(distinct);
+        leftIter.next();
+        rightIter.next();
+    }
+
+    /** Copy the data from the source segment at the position and length indicated by sourceIter,
+     * into the destination segment, and advance the construction iterator.
+     */
+    private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
+                                       LongArray dest,
+                                       FileChannel sourceChannel,
+                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+
+        long size = sourceIter.endOffset - sourceIter.startOffset;
+        long start = mergingIter.startOffset;
+        long end = start + size;
+
+        dest.transferFrom(sourceChannel,
+                sourceIter.startOffset,
+                mergingIter.startOffset,
+                end);
+
+        boolean putNext = mergingIter.putNext(size / 2);
+        boolean iterNext = sourceIter.next();
+
+        if (!putNext && iterNext)
+            throw new IllegalStateException("Source iterator ran out before dest iterator?!");
+
+        return iterNext;
+    }
+}
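The merge loop above is a two-pointer union over the sorted word segments, with three cases per step: both sides hold the current word, only the left does, or only the right. Stripped of the document shuffling, the control flow reduces to a standard sorted-union, sketched here on plain arrays:

    // Illustration only; the real loop also transfers document records per word.
    static long[] unionSorted(long[] a, long[] b) {
        long[] out = new long[a.length + b.length];
        int i = 0, j = 0, n = 0;
        while (i < a.length && j < b.length) {
            if (a[i] == b[j])     { out[n++] = a[i]; i++; j++; } // word present in both inputs
            else if (a[i] < b[j]) { out[n++] = a[i++]; }
            else                  { out[n++] = b[j++]; }
        }
        while (i < a.length) out[n++] = a[i++];
        while (j < b.length) out[n++] = b[j++];
        return java.util.Arrays.copyOf(out, n);
    }

The equals-case is also why the merged documents file can come out smaller than the sum of its inputs, which is what shrinkMergedDocuments compensates for.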
prio/PrioPreindexDocuments.java (new file)

@@ -0,0 +1,141 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.rwf.RandomFileAssembler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+/** A LongArray with document data, segmented according to
+ * the associated ReversePreindexWordSegments data
+ */
+public class PrioPreindexDocuments {
+    public final LongArray documents;
+
+    private static PositionsFileConstructor positionsFileConstructor;
+    private static final int RECORD_SIZE_LONGS = 2;
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
+
+    public final Path file;
+
+    public PrioPreindexDocuments(LongArray documents, Path file) {
+        this.documents = documents;
+        this.file = file;
+    }
+
+    public static PrioPreindexDocuments construct(
+            Path docsFile,
+            Path workDir,
+            IndexJournalReader reader,
+            DocIdRewriter docIdRewriter,
+            PositionsFileConstructor positionsFileConstructor,
+            PrioPreindexWordSegments segments) throws IOException {
+        PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+
+        createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
+
+        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
+        sortDocsFile(docsFileMap, segments);
+
+        return new PrioPreindexDocuments(docsFileMap, docsFile);
+    }
+
+    public FileChannel createDocumentsFileChannel() throws IOException {
+        return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+    }
+
+
+    public LongArray slice(long start, long end) {
+        return documents.range(start, end);
+    }
+
+    public long size() {
+        return documents.size();
+    }
+
+    private static void createUnsortedDocsFile(Path docsFile,
+                                               Path workDir,
+                                               IndexJournalReader reader,
+                                               PrioPreindexWordSegments segments,
+                                               DocIdRewriter docIdRewriter) throws IOException {
+
+        long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
+
+        try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+             var pointer = reader.newPointer())
+        {
+
+            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            offsetMap.defaultReturnValue(0);
+
+            while (pointer.nextDocument()) {
+                long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
+                for (var termData : pointer) {
+                    long termId = termData.termId();
+
+                    long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+
+                    // write position data to the positions file and get the offset
+                    long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
+
+                    assembly.put(offset + 0, rankEncodedId);
+                    assembly.put(offset + 1, encodedPosOffset);
+                }
+            }
+
+            assembly.write(docsFile);
+        }
+    }
+
+    @SneakyThrows
+    private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) throws IOException {
+
+        var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+        ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
+
+        while (iter.next()) {
+            long iterStart = iter.startOffset;
+            long iterEnd = iter.endOffset;
+
+            if (iter.size() < 1024) {
+                docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+            }
+            else {
+                sortingWorkers.execute(() ->
+                        docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
+            }
+        }
+
+        sortingWorkers.shutdown();
+        while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
+
+        sortingWorkers.close();
+    }
+
+    public void delete() throws IOException {
+        Files.delete(this.file);
+        documents.close();
+    }
+
+    public void close() {
+        documents.close();
+    }
+
+    public void force() {
+        documents.force();
+    }
+}
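Each document record spans RECORD_SIZE_LONGS = 2 longs (the rank-encoded document id followed by the positions-file offset), so the sort runs quickSortN with a stride of 2 to move whole records; presumably records are ordered by their first long, though that detail lives in the LongArray implementation rather than in this diff:

    // before: [ 40, 7,   10, 3 ]   records (40,7) and (10,3)
    // after:  [ 10, 3,   40, 7 ]   records reordered, pairs kept intact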
prio/PrioPreindexReference.java (new file)

@@ -0,0 +1,36 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArrayFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/** This is a dehydrated version of a PrioPreIndex, that only
+ * keeps references to its location on disk but does not hold associated
+ * memory maps.
+ */
+public record PrioPreindexReference(
+        Path wordsFile,
+        Path countsFile,
+        Path documentsFile
+)
+{
+    public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this(segments.wordsFile, segments.countsFile, documents.file);
+    }
+
+    public PrioPreindex open() throws IOException {
+        return new PrioPreindex(
+                new PrioPreindexWordSegments(
+                        LongArrayFactory.mmapForModifyingShared(wordsFile),
+                        LongArrayFactory.mmapForModifyingShared(countsFile),
+                        wordsFile,
+                        countsFile
+                ),
+                new PrioPreindexDocuments(
+                        LongArrayFactory.mmapForModifyingShared(documentsFile),
+                        documentsFile
+                )
+        );
+    }
+}
prio/PrioPreindexWordSegments.java (new file)

@@ -0,0 +1,205 @@
+package nu.marginalia.index.construction.prio;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongIterator;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/** A pair of file-backed arrays of sorted wordIds
+ * and the count of documents associated with each termId.
+ */
+public class PrioPreindexWordSegments {
+    public final LongArray wordIds;
+    public final LongArray counts;
+
+    final Path wordsFile;
+    final Path countsFile;
+
+    public PrioPreindexWordSegments(LongArray wordIds,
+                                    LongArray counts,
+                                    Path wordsFile,
+                                    Path countsFile)
+    {
+        assert wordIds.size() == counts.size();
+
+        this.wordIds = wordIds;
+        this.counts = counts;
+        this.wordsFile = wordsFile;
+        this.countsFile = countsFile;
+    }
+
+    /** Returns a long-long hash map where each key is a termId,
+     * and each value is the start offset of the data.
+     */
+    public Long2LongOpenHashMap asMap(int recordSize) {
+        if (wordIds.size() > Integer.MAX_VALUE)
+            throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries");
+
+        Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
+        var iter = iterator(recordSize);
+
+        while (iter.next()) {
+            ret.put(iter.wordId, iter.startOffset);
+        }
+
+        return ret;
+    }
+
+    public static PrioPreindexWordSegments construct(IndexJournalReader reader,
+                                                     Path wordIdsFile,
+                                                     Path countsFile)
+    throws IOException
+    {
+        Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
+        countsMap.defaultReturnValue(0);
+        reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
+
+        LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
+        LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
+
+        // Create the words file by iterating over the map and inserting them into
+        // the words file in whatever bizarro hash table order they appear in
+        long i = 0;
+        LongIterator iter = countsMap.keySet().iterator();
+        while (iter.hasNext()) {
+            words.set(i++, iter.nextLong());
+        }
+
+        // Sort the words file
+        words.sort(0, counts.size());
+
+        // Populate the counts
+        for (i = 0; i < countsMap.size(); i++) {
+            counts.set(i, countsMap.get(words.get(i)));
+        }
+
+        return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile);
+    }
+
+    public SegmentIterator iterator(int recordSize) {
+        return new SegmentIterator(recordSize);
+    }
+    public SegmentConstructionIterator constructionIterator(int recordSize) {
+        return new SegmentConstructionIterator(recordSize);
+    }
+
+    public long totalSize() {
+        return counts.fold(0, 0, counts.size(), Long::sum);
+    }
+
+    public void delete() throws IOException {
+        Files.delete(countsFile);
+        Files.delete(wordsFile);
+
+        counts.close();
+        wordIds.close();
+    }
+
+    public void force() {
+        counts.force();
+        wordIds.force();
+    }
+
+    public void close() {
+        wordIds.close();
+        counts.close();
+    }
+
+    public class SegmentIterator {
+        private final int recordSize;
+        private final long fileSize;
+        long wordId;
+        long startOffset = 0;
+        long endOffset = 0;
+
+        private SegmentIterator(int recordSize) {
+            this.recordSize = recordSize;
+            this.fileSize = wordIds.size();
+        }
+
+        private long i = -1;
+        public long idx() {
+            return i;
+        }
+        public boolean next() {
+            if (++i >= fileSize) {
+                wordId = Long.MIN_VALUE;
+                return false;
+            }
+
+            wordId = wordIds.get(i);
+            startOffset = endOffset;
+            endOffset = startOffset + recordSize * counts.get(i);
+
+            return true;
+        }
+
+        public boolean hasMorePositions() {
+            return i + 1 < wordIds.size();
+        }
+
+        public boolean isPositionBeforeEnd() {
+            return i < wordIds.size();
+        }
+
+        public long size() {
+            return endOffset - startOffset;
+        }
+    }
+
+    class SegmentConstructionIterator {
+        private final int recordSize;
+        private final long fileSize;
+        long wordId;
+        long startOffset = 0;
+        long endOffset = 0;
+
+        private SegmentConstructionIterator(int recordSize) {
+            this.recordSize = recordSize;
+            this.fileSize = wordIds.size();
+            if (fileSize == 0) {
+                throw new IllegalArgumentException("Cannot construct zero-length word segment file");
+            }
+            this.wordId = wordIds.get(0);
+        }
+
+        private long i = 0;
+        public long idx() {
+            return i;
+        }
+
+        public boolean putNext(long size) {
+
+            if (i >= fileSize)
+                return false;
+
+            endOffset = startOffset + recordSize * size;
+            counts.set(i, size);
+            startOffset = endOffset;
+            endOffset = -1;
+
+            i++;
+
+            if (i == fileSize) {
+                // We've reached the end of the iteration and there is no
+                // "next" termId to fetch
+                wordId = Long.MIN_VALUE;
+                return false;
+            }
+            else {
+                wordId = wordIds.get(i);
+                return true;
+            }
+        }
+
+        public boolean canPutMore() {
+            return i < wordIds.size();
+        }
+    }
+}
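SegmentIterator is how consumers walk the (wordId, count) arrays as ranges into the documents array: next() accumulates counts into successive [startOffset, endOffset) windows. Typical use, matching how the merge code drives it:

    var iter = segments.iterator(2);     // recordSize: 2 longs per document record
    while (iter.next()) {
        long wordId = iter.wordId;
        long start  = iter.startOffset;  // window into the documents LongArray
        long end    = iter.endOffset;
        // e.g. documents.slice(start, end) for this word's records
    }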
ReverseIndexReaderTest.java

@@ -4,9 +4,9 @@ import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.PositionsFileConstructor;
-import nu.marginalia.index.construction.ReversePreindex;
-import nu.marginalia.index.construction.TestJournalFactory;
-import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.construction.full.FullPreindex;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
 import nu.marginalia.index.positions.PositionsFileReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -19,7 +19,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 
-import static nu.marginalia.index.construction.TestJournalFactory.wm;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
 import static org.junit.jupiter.api.Assertions.*;
 
 class ReverseIndexReaderTest {
@@ -99,7 +99,7 @@ class ReverseIndexReaderTest {
         Path wordsFile = tempDir.resolve("words.dat");
 
         try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
-            var preindex = ReversePreindex.constructPreindex(reader,
+            var preindex = FullPreindex.constructPreindex(reader,
                     positionsFileConstructor,
                     DocIdRewriter.identity(), tempDir);
             preindex.finalizeIndex(docsFile, wordsFile);
 
ReversePreindexDocsTest.java → full/FullPreindexDocsTest.java

@@ -1,5 +1,7 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -11,10 +13,10 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
-import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
-class ReversePreindexDocsTest {
+class FullPreindexDocsTest {
     Path countsFile;
     Path wordsIdFile;
     Path docsFile;
@@ -57,8 +59,8 @@ class ReversePreindexDocsTest {
                 new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
         );
 
-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
 
         List<TestSegmentData> expected = List.of(
                 new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
@@ -86,8 +88,8 @@ class ReversePreindexDocsTest {
                 new EntryData(-0xF00BA3L, 0, 4, 4)
         );
 
-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
                 new PositionsFileConstructor(positionsFile),
                 segments);
 
@@ -115,8 +117,8 @@ class ReversePreindexDocsTest {
                 new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
         );
 
-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
                 new PositionsFileConstructor(positionsFile),
                 segments);
 
ReversePreindexFinalizeTest.java → full/FullPreindexFinalizeTest.java

@@ -1,8 +1,10 @@
 
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.model.BTreeHeader;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -12,11 +14,11 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 
-import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-class ReversePreindexFinalizeTest {
+class FullPreindexFinalizeTest {
     TestJournalFactory journalFactory;
     Path positionsFile;
     Path countsFile;
@@ -52,7 +54,7 @@ class ReversePreindexFinalizeTest {
     @Test
     public void testFinalizeSimple() throws IOException {
         var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
-        var preindex = ReversePreindex.constructPreindex(reader,
+        var preindex = FullPreindex.constructPreindex(reader,
                 new PositionsFileConstructor(positionsFile),
                 DocIdRewriter.identity(), tempDir);
 
@@ -90,7 +92,7 @@ class ReversePreindexFinalizeTest {
                 new EntryDataWithWordMeta(101, 101, wm(51, 52))
         );
 
-        var preindex = ReversePreindex.constructPreindex(reader,
+        var preindex = FullPreindex.constructPreindex(reader,
                 new PositionsFileConstructor(positionsFile),
                 DocIdRewriter.identity(), tempDir);
 
@ -1,6 +1,8 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;

import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -10,10 +12,10 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static nu.marginalia.index.construction.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;

class ReversePreindexMergeTest {
class FullPreindexMergeTest {
TestJournalFactory journalFactory;
Path countsFile;
Path wordsIdFile;
@ -46,19 +48,19 @@ class ReversePreindexMergeTest {
Files.delete(tempDir);
}

public ReversePreindex runMergeScenario(
public FullPreindex runMergeScenario(
List<EntryDataWithWordMeta> leftData,
List<EntryDataWithWordMeta> rightData
) throws IOException {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));

var left = ReversePreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = ReversePreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return ReversePreindex.merge(tempDir, left, right);
var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
return FullPreindex.merge(tempDir, left, right);
}

private List<TestSegmentData> getData(ReversePreindex merged) {
private List<TestSegmentData> getData(FullPreindex merged) {
var iter = merged.segments.iterator(2);
List<TestSegmentData> actual = new ArrayList<>();
while (iter.next()) {
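
As a usage note, a hedged sketch of how the merge helper above might be driven inside this test class; the EntryDataWithWordMeta records and the wm(wordId, meta) helper are the same TestJournalFactory constructs used elsewhere in this diff:

// Sketch only: exercises runMergeScenario exactly as declared above.
List<EntryDataWithWordMeta> leftData = List.of(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
List<EntryDataWithWordMeta> rightData = List.of(new EntryDataWithWordMeta(101, 101, wm(51, 52)));

FullPreindex merged = runMergeScenario(leftData, rightData);

// getData walks merged.segments.iterator(2) and collects one
// TestSegmentData per word range, ready for assertions
List<TestSegmentData> actual = getData(merged);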
@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;

import nu.marginalia.array.LongArray;
import org.junit.jupiter.api.AfterEach;
@ -11,10 +11,10 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.index.construction.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.*;

class ReversePreindexWordSegmentsTest {
class FullPreindexWordSegmentsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
@ -51,7 +51,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 1L<<33)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);

List<TestSegmentData> expected = List.of(
@ -72,7 +72,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 5, 5)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);

List<TestSegmentData> expected = List.of(
@ -94,7 +94,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);

List<TestSegmentData> expected = List.of(
@ -120,7 +120,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);

var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);

List<TestSegmentData> expected = List.of(
@ -148,7 +148,7 @@ class ReversePreindexWordSegmentsTest {
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
countsArray.set(0, 2, 1, 3, 5);
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);

var ritr = segments.iterator(1);
assertTrue(ritr.hasMorePositions());
@ -196,7 +196,7 @@ class ReversePreindexWordSegmentsTest {
LongArray wordsArray = LongArray.allocate(4);
LongArray countsArray = LongArray.allocate(4);
wordsArray.set(0, -1, -2, -3, -4);
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);

var citr = segments.constructionIterator(1);
assertEquals(-1, citr.wordId);
@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;

import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -1,4 +1,4 @@
package nu.marginalia.index.construction;
package nu.marginalia.index.construction.full;

import java.util.Arrays;
@ -3,13 +3,11 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.IndexLocations;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@ -33,7 +31,6 @@ import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
@ -247,7 +244,7 @@ public class CombinedIndexReaderTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor =
new ReverseIndexConstructor(
new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -267,7 +264,7 @@ public class CombinedIndexReaderTest {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -14,7 +14,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -291,7 +291,7 @@ public class IndexQueryServiceIntegrationSmokeTest {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -313,7 +313,7 @@ public class IndexQueryServiceIntegrationSmokeTest {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -7,13 +7,13 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -493,7 +493,7 @@ public class IndexQueryServiceIntegrationTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor =
new ReverseIndexConstructor(
new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -513,7 +513,7 @@ public class IndexQueryServiceIntegrationTest {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -6,10 +6,11 @@ import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.journal.reader.IndexJournalReader;
@ -117,7 +118,7 @@ public class IndexConstructorMain extends ProcessMainClass {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -142,7 +143,7 @@ public class IndexConstructorMain extends ProcessMainClass {
// important to the document. This filter will act on the encoded {@see WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();

var constructor = new ReverseIndexConstructor(
var constructor = new PrioIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
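
To make the full/priority split concrete, a hedged sketch of the choice IndexConstructorMain now makes. The hunks above truncate both argument lists after outputFilePositions, so the trailing readerSource, docIdRewriter, and tmpDir parameters below are assumptions for illustration, not text from this commit:

// Sketch only: trailing constructor arguments are assumed, since the
// diff cuts off after outputFilePositions.
var fullConstructor = new FullIndexConstructor(
        outputFileDocs, outputFileWords, outputFilePositions,
        readerSource, docIdRewriter, tmpDir);

// The priority index presumably narrows its input with the
// wordMetaFilter computed above from the encoded WordMetadata.
var prioConstructor = new PrioIndexConstructor(
        outputFileDocs, outputFileWords, outputFilePositions,
        readerSource, docIdRewriter, tmpDir);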
@ -17,7 +17,7 @@ import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@ -244,7 +244,7 @@ public class IntegrationTest {

if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
@ -269,7 +269,7 @@ public class IntegrationTest {
// important to the document. This filter will act on the encoded {@see WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();

var constructor = new ReverseIndexConstructor(
var constructor = new FullIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,