Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Synced 2025-02-24 05:18:58 +00:00

Commit 85c99ae808 (parent a4ecd5f4ce)

(index-reverse) Split index construction into separate packages for full and priority index
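This commit renames the ReverseIndex*/ReversePreindex* construction classes to FullIndex*/FullPreindex* under nu.marginalia.index.construction.full, and introduces a parallel nu.marginalia.index.construction.prio package for the priority index. Both constructors end up with the same shape of API; a minimal usage sketch follows, in which only the constructor and createReverseIndex signatures are taken from this diff, while the output paths, reader source, doc-id rewriter, heartbeat and journal directory are hypothetical stand-ins for values the calling process owns:

    // Hypothetical wiring; outDir, prioDir, readerSource, docIdRewriter,
    // heartbeat and journalDir are assumed to come from the surrounding process code.
    var full = new FullIndexConstructor(
            outDir.resolve("docs.dat"), outDir.resolve("words.dat"),
            outDir.resolve("positions.dat"), readerSource, docIdRewriter, tmpDir);
    full.createReverseIndex(heartbeat, "createFullIndex", journalDir);

    var prio = new PrioIndexConstructor(
            prioDir.resolve("docs.dat"), prioDir.resolve("words.dat"),
            prioDir.resolve("positions.dat"), readerSource, docIdRewriter, tmpDir);
    prio.createReverseIndex(heartbeat, "createPrioIndex", journalDir);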
FullIndexBTreeTransformer.java (renamed from ReverseIndexBTreeTransformer.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.algo.LongArrayTransformations;
@@ -9,7 +9,7 @@ import java.io.IOException;
 import java.nio.channels.FileChannel;
 
 /** Constructs the BTrees in a reverse index */
-public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     private final BTreeWriter writer;
     private final FileChannel intermediateChannel;
 
@@ -18,10 +18,10 @@ public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     long start = 0;
     long writeOffset = 0;
 
-    public ReverseIndexBTreeTransformer(LongArray urlsFileMap,
+    public FullIndexBTreeTransformer(LongArray urlsFileMap,
                                      int entrySize,
                                      BTreeContext bTreeContext,
                                      FileChannel intermediateChannel) {
         this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
         this.entrySize = entrySize;
         this.intermediateChannel = intermediateChannel;
FullIndexConstructor.java (renamed from ReverseIndexConstructor.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,6 +1,9 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
@@ -10,9 +13,9 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.concurrent.atomic.AtomicInteger;
 
-public class ReverseIndexConstructor {
+public class FullIndexConstructor {
 
-    private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullIndexConstructor.class);
 
     public enum CreateReverseIndexSteps {
         CONSTRUCT,
@@ -27,12 +30,12 @@ public class ReverseIndexConstructor {
     private final DocIdRewriter docIdRewriter;
     private final Path tmpDir;
 
-    public ReverseIndexConstructor(Path outputFileDocs,
+    public FullIndexConstructor(Path outputFileDocs,
                                 Path outputFileWords,
                                 Path outputFilePositions,
                                 JournalReaderSource readerSource,
                                 DocIdRewriter docIdRewriter,
                                 Path tmpDir) {
         this.outputFileDocs = outputFileDocs;
         this.outputFileWords = outputFileWords;
         this.outputFilePositions = outputFilePositions;
@@ -77,20 +80,20 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
-        return ReversePreindex
+    private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return FullPreindex
                 .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
                 .closeToReference();
     }
 
     @SneakyThrows
-    private ReversePreindexReference merge(ReversePreindexReference leftR, ReversePreindexReference rightR) {
+    private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
 
         var left = leftR.open();
         var right = rightR.open();
 
         try {
-            return ReversePreindex.merge(tmpDir, left, right).closeToReference();
+            return FullPreindex.merge(tmpDir, left, right).closeToReference();
         }
         finally {
             left.delete();
@@ -101,7 +104,7 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private void finalizeIndex(ReversePreindexReference finalPR) {
+    private void finalizeIndex(FullPreindexReference finalPR) {
         var finalP = finalPR.open();
         finalP.finalizeIndex(outputFileDocs, outputFileWords);
         finalP.delete();
FullPreindex.java (renamed from ReversePreindex.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,9 +1,13 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,13 +29,13 @@ import static nu.marginalia.array.algo.TwoArrayOperations.*;
  * the union of their data. This operation requires no additional
  * RAM.
  */
-public class ReversePreindex {
-    final ReversePreindexWordSegments segments;
-    final ReversePreindexDocuments documents;
+public class FullPreindex {
+    final FullPreindexWordSegments segments;
+    final FullPreindexDocuments documents;
 
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindex.class);
 
-    public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindex(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this.segments = segments;
         this.documents = documents;
     }
@@ -39,27 +43,27 @@ public class ReversePreindex {
     /** Constructs a new preindex with the data associated with reader. The backing files
      * will have randomly assigned names.
      */
-    public static ReversePreindex constructPreindex(IndexJournalReader reader,
+    public static FullPreindex constructPreindex(IndexJournalReader reader,
                                                  PositionsFileConstructor positionsFileConstructor,
                                                  DocIdRewriter docIdRewriter,
                                                  Path workDir) throws IOException
     {
         Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
-        var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
-        return new ReversePreindex(segments, docs);
+        var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        return new FullPreindex(segments, docs);
     }
 
     /** Close the associated memory mapped areas and return
      * a dehydrated version of this object that can be re-opened
      * later.
      */
-    public ReversePreindexReference closeToReference() {
+    public FullPreindexReference closeToReference() {
         try {
-            return new ReversePreindexReference(segments, documents);
+            return new FullPreindexReference(segments, documents);
         }
         finally {
             segments.force();
@@ -85,7 +89,7 @@ public class ReversePreindex {
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
         try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
             offsets.transformEachIO(0, offsets.size(),
-                    new ReverseIndexBTreeTransformer(finalDocs, 2,
+                    new FullIndexBTreeTransformer(finalDocs, 2,
                             ReverseIndexParameters.docsBTreeContext,
                             intermediateDocChannel));
             intermediateDocChannel.force(false);
@@ -126,11 +130,11 @@ public class ReversePreindex {
         documents.delete();
     }
 
-    public static ReversePreindex merge(Path destDir,
-                                     ReversePreindex left,
-                                     ReversePreindex right) throws IOException {
+    public static FullPreindex merge(Path destDir,
+                                     FullPreindex left,
+                                     FullPreindex right) throws IOException {
 
-        ReversePreindexWordSegments mergingSegment =
+        FullPreindexWordSegments mergingSegment =
                 createMergedSegmentWordFile(destDir, left.segments, right.segments);
 
         var mergingIter = mergingSegment.constructionIterator(2);
@@ -198,18 +202,18 @@ public class ReversePreindex {
         mergedDocuments = shrinkMergedDocuments(mergedDocuments,
                 docsFile, 2 * mergingSegment.totalSize());
 
-        return new ReversePreindex(
+        return new FullPreindex(
                 mergingSegment,
-                new ReversePreindexDocuments(mergedDocuments, docsFile)
+                new FullPreindexDocuments(mergedDocuments, docsFile)
         );
     }
 
     /** Create a segment word file with each word from both inputs, with zero counts for all the data.
      * This is an intermediate product in merging.
      */
-    static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
-                                                                ReversePreindexWordSegments left,
-                                                                ReversePreindexWordSegments right) throws IOException {
+    static FullPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+                                                                FullPreindexWordSegments left,
+                                                                FullPreindexWordSegments right) throws IOException {
         Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
 
@@ -228,7 +232,7 @@ public class ReversePreindex {
 
         LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
 
-        return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+        return new FullPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
     }
 
     /** It's possible we overestimated the necessary size of the documents file,
@@ -256,12 +260,12 @@ public class ReversePreindex {
     /** Merge contents of the segments indicated by leftIter and rightIter into the destionation
      * segment, and advance the construction iterator with the appropriate size.
      */
-    private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
-                                      ReversePreindexWordSegments.SegmentIterator rightIter,
-                                      ReversePreindexDocuments left,
-                                      ReversePreindexDocuments right,
+    private static void mergeSegments(FullPreindexWordSegments.SegmentIterator leftIter,
+                                      FullPreindexWordSegments.SegmentIterator rightIter,
+                                      FullPreindexDocuments left,
+                                      FullPreindexDocuments right,
                                       LongArray dest,
-                                      ReversePreindexWordSegments.SegmentConstructionIterator destIter)
+                                      FullPreindexWordSegments.SegmentConstructionIterator destIter)
     {
         long segSize = mergeArrays2(dest,
                 left.documents,
@@ -279,10 +283,10 @@ public class ReversePreindex {
     /** Copy the data from the source segment at the position and length indicated by sourceIter,
      * into the destination segment, and advance the construction iterator.
      */
-    private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
+    private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
                                        LongArray dest,
                                        FileChannel sourceChannel,
-                                       ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
FullPreindexDocuments.java (renamed from ReversePreindexDocuments.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,8 +1,10 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.rwf.RandomFileAssembler;
 import org.slf4j.Logger;
@@ -20,35 +22,35 @@ import java.util.concurrent.TimeUnit;
 /** A LongArray with document data, segmented according to
  * the associated ReversePreindexWordSegments data
  */
-public class ReversePreindexDocuments {
+public class FullPreindexDocuments {
     public final LongArray documents;
 
     private static PositionsFileConstructor positionsFileConstructor;
     private static final int RECORD_SIZE_LONGS = 2;
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindexDocuments.class);
 
     public final Path file;
 
-    public ReversePreindexDocuments(LongArray documents, Path file) {
+    public FullPreindexDocuments(LongArray documents, Path file) {
         this.documents = documents;
         this.file = file;
     }
 
-    public static ReversePreindexDocuments construct(
+    public static FullPreindexDocuments construct(
             Path docsFile,
             Path workDir,
             IndexJournalReader reader,
             DocIdRewriter docIdRewriter,
             PositionsFileConstructor positionsFileConstructor,
-            ReversePreindexWordSegments segments) throws IOException {
-        ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+            FullPreindexWordSegments segments) throws IOException {
+        FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
 
         createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
 
         LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
         sortDocsFile(docsFileMap, segments);
 
-        return new ReversePreindexDocuments(docsFileMap, docsFile);
+        return new FullPreindexDocuments(docsFileMap, docsFile);
     }
 
     public FileChannel createDocumentsFileChannel() throws IOException {
@@ -67,7 +69,7 @@ public class ReversePreindexDocuments {
     private static void createUnsortedDocsFile(Path docsFile,
                                                Path workDir,
                                                IndexJournalReader reader,
-                                               ReversePreindexWordSegments segments,
+                                               FullPreindexWordSegments segments,
                                                DocIdRewriter docIdRewriter) throws IOException {
 
         long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
@@ -99,7 +101,7 @@ public class ReversePreindexDocuments {
     }
 
     @SneakyThrows
-    private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
+    private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) throws IOException {
 
         var iter = segments.iterator(RECORD_SIZE_LONGS);
 
FullPreindexReference.java (renamed from ReversePreindexReference.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,33 +1,33 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;
 
 import java.io.IOException;
 import java.nio.file.Path;
 
-/** This is a dehydrated version of a ReversePreIndex, that only
+/** This is a dehydrated version of a FullPreIndex, that only
  * keeps references to its location on disk but does not hold associated
  * memory maps.
  */
-public record ReversePreindexReference(
+public record FullPreindexReference(
         Path wordsFile,
         Path countsFile,
         Path documentsFile
 )
 {
-    public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this(segments.wordsFile, segments.countsFile, documents.file);
     }
 
-    public ReversePreindex open() throws IOException {
-        return new ReversePreindex(
-                new ReversePreindexWordSegments(
+    public FullPreindex open() throws IOException {
+        return new FullPreindex(
+                new FullPreindexWordSegments(
                         LongArrayFactory.mmapForModifyingShared(wordsFile),
                         LongArrayFactory.mmapForModifyingShared(countsFile),
                         wordsFile,
                         countsFile
                 ),
-                new ReversePreindexDocuments(
+                new FullPreindexDocuments(
                         LongArrayFactory.mmapForModifyingShared(documentsFile),
                         documentsFile
                 )
FullPreindexWordSegments.java (renamed from ReversePreindexWordSegments.java; package nu.marginalia.index.construction → nu.marginalia.index.construction.full)

@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -14,17 +14,17 @@ import java.nio.file.Path;
 /** A pair of file-backed arrays of sorted wordIds
  * and the count of documents associated with each termId.
  */
-public class ReversePreindexWordSegments {
+public class FullPreindexWordSegments {
     public final LongArray wordIds;
     public final LongArray counts;
 
     final Path wordsFile;
     final Path countsFile;
 
-    public ReversePreindexWordSegments(LongArray wordIds,
+    public FullPreindexWordSegments(LongArray wordIds,
                                     LongArray counts,
                                     Path wordsFile,
                                     Path countsFile)
     {
         assert wordIds.size() == counts.size();
 
@@ -51,9 +51,9 @@ public class ReversePreindexWordSegments {
         return ret;
     }
 
-    public static ReversePreindexWordSegments construct(IndexJournalReader reader,
+    public static FullPreindexWordSegments construct(IndexJournalReader reader,
                                                      Path wordIdsFile,
                                                      Path countsFile)
             throws IOException
     {
         Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
@@ -79,7 +79,7 @@ public class ReversePreindexWordSegments {
             counts.set(i, countsMap.get(words.get(i)));
         }
 
-        return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
+        return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile);
     }
 
     public SegmentIterator iterator(int recordSize) {
PrioIndexBTreeTransformer.java (new file, package nu.marginalia.index.construction.prio)

@@ -0,0 +1,48 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.btree.model.BTreeContext;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+
+/** Constructs the BTrees in a reverse index */
+public class PrioIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+    private final BTreeWriter writer;
+    private final FileChannel intermediateChannel;
+
+    private final int entrySize;
+
+    long start = 0;
+    long writeOffset = 0;
+
+    public PrioIndexBTreeTransformer(LongArray urlsFileMap,
+                                     int entrySize,
+                                     BTreeContext bTreeContext,
+                                     FileChannel intermediateChannel) {
+        this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
+        this.entrySize = entrySize;
+        this.intermediateChannel = intermediateChannel;
+    }
+
+    @Override
+    public long transform(long pos, long end) throws IOException {
+
+        final int size = (int) ((end - start) / entrySize);
+
+        if (size == 0) {
+            return -1;
+        }
+
+        final long offsetForBlock = writeOffset;
+
+        writeOffset += writer.write(writeOffset, size,
+                mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+        );
+
+        start = end;
+        return offsetForBlock;
+    }
+}
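transform() is handed the cumulative end offset of each word's document segment, and the distance from the previous end gives the segment's size in longs. A worked example under the assumption entrySize = 2, as used for the docs records elsewhere in this diff:

    // With entrySize = 2 and a segment spanning longs [0, 10):
    //   size = (10 - 0) / 2 = 5 document records;
    //   writer.write(writeOffset, 5, ...) copies those 10 longs from the
    //   intermediate channel into a BTree block and returns the block length,
    //   advancing writeOffset for the next segment.
    // The returned offsetForBlock is stored back into the offsets array by
    // transformEachIO, so it later becomes the pointer written into the words BTree.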
PrioIndexConstructor.java (new file, package nu.marginalia.index.construction.prio)

@@ -0,0 +1,114 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.IndexJournalFileNames;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class PrioIndexConstructor {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class);
+
+    public enum CreateReverseIndexSteps {
+        CONSTRUCT,
+        FINALIZE,
+        FINISHED
+    }
+
+    private final Path outputFileDocs;
+    private final Path outputFileWords;
+    private final Path outputFilePositions;
+    private final JournalReaderSource readerSource;
+    private final DocIdRewriter docIdRewriter;
+    private final Path tmpDir;
+
+    public PrioIndexConstructor(Path outputFileDocs,
+                                Path outputFileWords,
+                                Path outputFilePositions,
+                                JournalReaderSource readerSource,
+                                DocIdRewriter docIdRewriter,
+                                Path tmpDir) {
+        this.outputFileDocs = outputFileDocs;
+        this.outputFileWords = outputFileWords;
+        this.outputFilePositions = outputFilePositions;
+        this.readerSource = readerSource;
+        this.docIdRewriter = docIdRewriter;
+        this.tmpDir = tmpDir;
+    }
+
+    public void createReverseIndex(ProcessHeartbeat processHeartbeat,
+                                   String processName,
+                                   Path sourceBaseDir) throws IOException
+    {
+        var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
+        if (inputs.isEmpty()) {
+            logger.error("No journal files in base dir {}", sourceBaseDir);
+            return;
+        }
+
+        try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
+             var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
+             var posConstructor = new PositionsFileConstructor(outputFilePositions)
+        ) {
+            heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
+
+            AtomicInteger progress = new AtomicInteger(0);
+
+            inputs
+                .parallelStream()
+                .map(in -> {
+                    preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
+                    return construct(in, posConstructor);
+                })
+                .reduce(this::merge)
+                .ifPresent((index) -> {
+                    heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
+                    finalizeIndex(index);
+                    heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+                });
+
+            heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+        }
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return PrioPreindex
+                .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
+                .closeToReference();
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
+
+        var left = leftR.open();
+        var right = rightR.open();
+
+        try {
+            return PrioPreindex.merge(tmpDir, left, right).closeToReference();
+        }
+        finally {
+            left.delete();
+            right.delete();
+        }
+
+
+    }
+
+    @SneakyThrows
+    private void finalizeIndex(PrioPreindexReference finalPR) {
+        var finalP = finalPR.open();
+        finalP.finalizeIndex(outputFileDocs, outputFileWords);
+        finalP.delete();
+    }
+
+
+}
PrioPreindex.java (new file, package nu.marginalia.index.construction.prio)

@@ -0,0 +1,310 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static nu.marginalia.array.algo.TwoArrayOperations.*;
+
+/** Contains the data that would go into a reverse index,
+ * that is, a mapping from words to documents, minus the actual
+ * index structure that makes the data quick to access while
+ * searching.
+ * <p>
+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
+public class PrioPreindex {
+    final PrioPreindexWordSegments segments;
+    final PrioPreindexDocuments documents;
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
+
+    public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this.segments = segments;
+        this.documents = documents;
+    }
+
+    /** Constructs a new preindex with the data associated with reader. The backing files
+     * will have randomly assigned names.
+     */
+    public static PrioPreindex constructPreindex(IndexJournalReader reader,
+                                                 PositionsFileConstructor positionsFileConstructor,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
+    {
+        Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
+        Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
+        Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
+
+        var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+        var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        return new PrioPreindex(segments, docs);
+    }
+
+    /** Close the associated memory mapped areas and return
+     * a dehydrated version of this object that can be re-opened
+     * later.
+     */
+    public PrioPreindexReference closeToReference() {
+        try {
+            return new PrioPreindexReference(segments, documents);
+        }
+        finally {
+            segments.force();
+            documents.force();
+            segments.close();
+            documents.close();
+        }
+    }
+
+    /** Transform the preindex into a reverse index */
+    public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+        var offsets = segments.counts;
+
+        Files.deleteIfExists(outputFileDocs);
+        Files.deleteIfExists(outputFileWords);
+
+        // Estimate the size of the docs index data
+        offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
+        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+        offsets.fold(0, 0, offsets.size(), sizeEstimator);
+
+        // Write the docs file
+        LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
+        try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
+            offsets.transformEachIO(0, offsets.size(),
+                    new PrioIndexBTreeTransformer(finalDocs, 2,
+                            ReverseIndexParameters.docsBTreeContext,
+                            intermediateDocChannel));
+            intermediateDocChannel.force(false);
+        }
+
+        LongArray wordIds = segments.wordIds;
+
+        if (offsets.size() != wordIds.size())
+            throw new IllegalStateException("Offsets and word-ids of different size");
+        if (offsets.size() > Integer.MAX_VALUE) {
+            throw new IllegalStateException("offsets.size() too big!");
+        }
+
+        // Estimate the size of the words index data
+        long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+        // Construct the tree
+        LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
+
+        new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+                .write(0, (int) offsets.size(), mapRegion -> {
+                    for (long i = 0; i < offsets.size(); i++) {
+                        mapRegion.set(2*i, wordIds.get(i));
+                        mapRegion.set(2*i + 1, offsets.get(i));
+                    }
+                });
+
+        finalDocs.force();
+        finalDocs.close();
+        wordsArray.force();
+        wordsArray.close();
+
+    }
+
+    /** Delete all files associated with this pre-index */
+    public void delete() throws IOException {
+        segments.delete();
+        documents.delete();
+    }
+
+    public static PrioPreindex merge(Path destDir,
+                                     PrioPreindex left,
+                                     PrioPreindex right) throws IOException {
+
+        PrioPreindexWordSegments mergingSegment =
+                createMergedSegmentWordFile(destDir, left.segments, right.segments);
+
+        var mergingIter = mergingSegment.constructionIterator(2);
+        var leftIter = left.segments.iterator(2);
+        var rightIter = right.segments.iterator(2);
+
+        Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+        LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size());
+
+        leftIter.next();
+        rightIter.next();
+
+        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
+             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+        {
+
+            while (mergingIter.canPutMore()
+                    && leftIter.isPositionBeforeEnd()
+                    && rightIter.isPositionBeforeEnd())
+            {
+                final long currentWord = mergingIter.wordId;
+
+                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+                {
+                    // both inputs have documents for the current word
+                    mergeSegments(leftIter, rightIter,
+                            left.documents, right.documents,
+                            mergedDocuments, mergingIter);
+                }
+                else if (leftIter.wordId == currentWord) {
+                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
+                        break;
+                }
+                else if (rightIter.wordId == currentWord) {
+                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
+                        break;
+                }
+                else assert false : "This should never happen"; // the helvetica scenario
+            }
+
+            if (leftIter.isPositionBeforeEnd()) {
+                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+            }
+
+            if (rightIter.isPositionBeforeEnd()) {
+                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+            }
+
+        }
+
+        if (leftIter.isPositionBeforeEnd())
+            throw new IllegalStateException("Left has more to go");
+        if (rightIter.isPositionBeforeEnd())
+            throw new IllegalStateException("Right has more to go");
+        if (mergingIter.canPutMore())
+            throw new IllegalStateException("Source iters ran dry before merging iter");
+
+
+        mergingSegment.force();
+
+        // We may have overestimated the size of the merged docs size in the case there were
+        // duplicates in the data, so we need to shrink it to the actual size we wrote.
+
+        mergedDocuments = shrinkMergedDocuments(mergedDocuments,
+                docsFile, 2 * mergingSegment.totalSize());
+
+        return new PrioPreindex(
+                mergingSegment,
+                new PrioPreindexDocuments(mergedDocuments, docsFile)
+        );
+    }
+
+    /** Create a segment word file with each word from both inputs, with zero counts for all the data.
+     * This is an intermediate product in merging.
+     */
+    static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+                                                                PrioPreindexWordSegments left,
+                                                                PrioPreindexWordSegments right) throws IOException {
+        Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+        Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+        // We need total size to request a direct LongArray range. Seems slower, but is faster.
+        // ... see LongArray.directRangeIfPossible(long start, long end)
+        long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
+
+        mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+                0,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
+
+        return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+    }
+
+    /** It's possible we overestimated the necessary size of the documents file,
+     * this will permit us to shrink it down to the smallest necessary size.
+     */
+    private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
+
+        mergedDocuments.force();
+
+        long beforeSize = mergedDocuments.size();
+        long afterSize = sizeLongs * 8;
+        if (beforeSize != afterSize) {
+            mergedDocuments.close();
+            try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
+                bc.truncate(sizeLongs * 8);
+            }
+
+            logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
+            mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
+        }
+
+        return mergedDocuments;
+    }
+
+    /** Merge contents of the segments indicated by leftIter and rightIter into the destionation
+     * segment, and advance the construction iterator with the appropriate size.
+     */
+    private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
+                                      PrioPreindexWordSegments.SegmentIterator rightIter,
+                                      PrioPreindexDocuments left,
+                                      PrioPreindexDocuments right,
+                                      LongArray dest,
+                                      PrioPreindexWordSegments.SegmentConstructionIterator destIter)
+    {
+        long segSize = mergeArrays2(dest,
+                left.documents,
+                right.documents,
+                destIter.startOffset,
+                leftIter.startOffset, leftIter.endOffset,
+                rightIter.startOffset, rightIter.endOffset);
+
+        long distinct = segSize / 2;
+        destIter.putNext(distinct);
+        leftIter.next();
+        rightIter.next();
+    }
+
+    /** Copy the data from the source segment at the position and length indicated by sourceIter,
+     * into the destination segment, and advance the construction iterator.
+     */
+    private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
+                                       LongArray dest,
+                                       FileChannel sourceChannel,
+                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+
+        long size = sourceIter.endOffset - sourceIter.startOffset;
+        long start = mergingIter.startOffset;
+        long end = start + size;
+
+        dest.transferFrom(sourceChannel,
+                sourceIter.startOffset,
+                mergingIter.startOffset,
+                end);
+
+        boolean putNext = mergingIter.putNext(size / 2);
+        boolean iterNext = sourceIter.next();
+
+        if (!putNext && iterNext)
+            throw new IllegalStateException("Source iterator ran out before dest iterator?!");
+
+        return iterNext;
+    }
+
+
+}
PrioPreindexDocuments.java (new file, package nu.marginalia.index.construction.prio)

@@ -0,0 +1,141 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.rwf.RandomFileAssembler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+/** A LongArray with document data, segmented according to
+ * the associated ReversePreindexWordSegments data
+ */
+public class PrioPreindexDocuments {
+    public final LongArray documents;
+
+    private static PositionsFileConstructor positionsFileConstructor;
+    private static final int RECORD_SIZE_LONGS = 2;
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
+
+    public final Path file;
+
+    public PrioPreindexDocuments(LongArray documents, Path file) {
+        this.documents = documents;
+        this.file = file;
+    }
+
+    public static PrioPreindexDocuments construct(
+            Path docsFile,
+            Path workDir,
+            IndexJournalReader reader,
+            DocIdRewriter docIdRewriter,
+            PositionsFileConstructor positionsFileConstructor,
+            PrioPreindexWordSegments segments) throws IOException {
+        PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+
+        createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
+
+        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
+        sortDocsFile(docsFileMap, segments);
+
+        return new PrioPreindexDocuments(docsFileMap, docsFile);
+    }
+
+    public FileChannel createDocumentsFileChannel() throws IOException {
+        return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+    }
+
+
+    public LongArray slice(long start, long end) {
+        return documents.range(start, end);
+    }
+
+    public long size() {
+        return documents.size();
+    }
+
+    private static void createUnsortedDocsFile(Path docsFile,
+                                               Path workDir,
+                                               IndexJournalReader reader,
+                                               PrioPreindexWordSegments segments,
+                                               DocIdRewriter docIdRewriter) throws IOException {
+
+        long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
+
+        try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+             var pointer = reader.newPointer())
+        {
+
+            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            offsetMap.defaultReturnValue(0);
+
+            while (pointer.nextDocument()) {
+                long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
+                for (var termData : pointer) {
+                    long termId = termData.termId();
+
+                    long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+
+                    // write position data to the positions file and get the offset
+                    long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
+
+                    assembly.put(offset + 0, rankEncodedId);
+                    assembly.put(offset + 1, encodedPosOffset);
+                }
+            }
+
+            assembly.write(docsFile);
+        }
+    }
+
+    @SneakyThrows
+    private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) throws IOException {
+
+        var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+        ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
+
+        while (iter.next()) {
+            long iterStart = iter.startOffset;
+            long iterEnd = iter.endOffset;
+
+            if (iter.size() < 1024) {
+                docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+            }
+            else {
+                sortingWorkers.execute(() ->
+                        docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
+            }
+        }
+
+        sortingWorkers.shutdown();
+        while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
+
+        sortingWorkers.close();
+    }
+
+    public void delete() throws IOException {
+        Files.delete(this.file);
+        documents.close();
+    }
+
+    public void close() {
+        documents.close();
+    }
+
+    public void force() {
+        documents.force();
+    }
+}
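Each record in the documents array is RECORD_SIZE_LONGS = 2 longs: the rank-encoded document id, then the offset of that document's encoded position data in the positions file. A hedged reading sketch over a single word's segment, where iter is a segment iterator as used in sortDocsFile above:

    // Hedged sketch; iter.startOffset / iter.endOffset bound one word's records
    // within docsFileMap.
    for (long i = iter.startOffset; i < iter.endOffset; i += RECORD_SIZE_LONGS) {
        long rankEncodedId    = docsFileMap.get(i);      // doc id, rank-encoded
        long encodedPosOffset = docsFileMap.get(i + 1);  // offset into the positions file
        // ... resolve position data via the positions file at encodedPosOffset
    }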
PrioPreindexReference.java (new file, package nu.marginalia.index.construction.prio)

@@ -0,0 +1,36 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArrayFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/** This is a dehydrated version of a PrioPreIndex, that only
+ * keeps references to its location on disk but does not hold associated
+ * memory maps.
+ */
+public record PrioPreindexReference(
+        Path wordsFile,
+        Path countsFile,
+        Path documentsFile
+)
+{
+    public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this(segments.wordsFile, segments.countsFile, documents.file);
+    }
+
+    public PrioPreindex open() throws IOException {
+        return new PrioPreindex(
+                new PrioPreindexWordSegments(
+                        LongArrayFactory.mmapForModifyingShared(wordsFile),
+                        LongArrayFactory.mmapForModifyingShared(countsFile),
+                        wordsFile,
+                        countsFile
+                ),
+                new PrioPreindexDocuments(
+                        LongArrayFactory.mmapForModifyingShared(documentsFile),
+                        documentsFile
+                )
+        );
+    }
+}
@ -0,0 +1,205 @@
|
|||||||
|
package nu.marginalia.index.construction.prio;

import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongIterator;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.journal.reader.IndexJournalReader;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/** A pair of file-backed arrays of sorted wordIds
 * and the count of documents associated with each termId.
 */
public class PrioPreindexWordSegments {
    public final LongArray wordIds;
    public final LongArray counts;

    final Path wordsFile;
    final Path countsFile;

    public PrioPreindexWordSegments(LongArray wordIds,
                                    LongArray counts,
                                    Path wordsFile,
                                    Path countsFile)
    {
        assert wordIds.size() == counts.size();

        this.wordIds = wordIds;
        this.counts = counts;
        this.wordsFile = wordsFile;
        this.countsFile = countsFile;
    }

    /** Returns a long-long hash map where each key is a termId,
     * and each value is the start offset of the data.
     */
    public Long2LongOpenHashMap asMap(int recordSize) {
        if (wordIds.size() > Integer.MAX_VALUE)
            throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries");

        Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
        var iter = iterator(recordSize);

        while (iter.next()) {
            ret.put(iter.wordId, iter.startOffset);
        }

        return ret;
    }

    public static PrioPreindexWordSegments construct(IndexJournalReader reader,
                                                     Path wordIdsFile,
                                                     Path countsFile)
            throws IOException
    {
        Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
        countsMap.defaultReturnValue(0);
        reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));

        LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
        LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());

        // Create the words file by iterating over the map and inserting them into
        // the words file in whatever bizarro hash table order they appear in
        long i = 0;
        LongIterator iter = countsMap.keySet().iterator();
        while (iter.hasNext()) {
            words.set(i++, iter.nextLong());
        }

        // Sort the words file
        words.sort(0, counts.size());

        // Populate the counts
        for (i = 0; i < countsMap.size(); i++) {
            counts.set(i, countsMap.get(words.get(i)));
        }

        return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile);
    }

    public SegmentIterator iterator(int recordSize) {
        return new SegmentIterator(recordSize);
    }
    public SegmentConstructionIterator constructionIterator(int recordSize) {
        return new SegmentConstructionIterator(recordSize);
    }

    public long totalSize() {
        return counts.fold(0, 0, counts.size(), Long::sum);
    }

    public void delete() throws IOException {
        Files.delete(countsFile);
        Files.delete(wordsFile);

        counts.close();
        wordIds.close();
    }

    public void force() {
        counts.force();
        wordIds.force();
    }

    public void close() {
        wordIds.close();
        counts.close();
    }

    public class SegmentIterator {
        private final int recordSize;
        private final long fileSize;
        long wordId;
        long startOffset = 0;
        long endOffset = 0;

        private SegmentIterator(int recordSize) {
            this.recordSize = recordSize;
            this.fileSize = wordIds.size();
        }

        private long i = -1;
        public long idx() {
            return i;
        }
        public boolean next() {
            if (++i >= fileSize) {
                wordId = Long.MIN_VALUE;
                return false;
            }

            wordId = wordIds.get(i);
            startOffset = endOffset;
            endOffset = startOffset + recordSize * counts.get(i);

            return true;
        }

        public boolean hasMorePositions() {
            return i + 1 < wordIds.size();
        }

        public boolean isPositionBeforeEnd() {
            return i < wordIds.size();
        }

        public long size() {
            return endOffset - startOffset;
        }
    }

    class SegmentConstructionIterator {
        private final int recordSize;
        private final long fileSize;
        long wordId;
        long startOffset = 0;
        long endOffset = 0;

        private SegmentConstructionIterator(int recordSize) {
            this.recordSize = recordSize;
            this.fileSize = wordIds.size();
            if (fileSize == 0) {
                throw new IllegalArgumentException("Cannot construct zero-length word segment file");
            }
            this.wordId = wordIds.get(0);
        }

        private long i = 0;
        public long idx() {
            return i;
        }

        public boolean putNext(long size) {

            if (i >= fileSize)
                return false;

            endOffset = startOffset + recordSize * size;
            counts.set(i, size);
            startOffset = endOffset;
            endOffset = -1;

            i++;

            if (i == fileSize) {
                // We've reached the end of the iteration and there is no
                // "next" termId to fetch
                wordId = Long.MIN_VALUE;
                return false;
            }
            else {
                wordId = wordIds.get(i);
                return true;
            }
        }

        public boolean canPutMore() {
            return i < wordIds.size();
        }
    }
}
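For orientation, here is a minimal usage sketch of the class above. It only uses methods visible in this file; the journal reader, the two scratch paths, and the process() consumer are assumed to exist for illustration and are not part of the commit:

    // Count and sort the termIds of a journal, then walk each term's
    // contiguous [startOffset, endOffset) range of records (recordSize = 1).
    var segments = PrioPreindexWordSegments.construct(reader, wordIdsFile, countsFile);
    var iter = segments.iterator(1);
    while (iter.next()) {
        process(iter.wordId, iter.startOffset, iter.endOffset); // process() is hypothetical
    }
    var offsets = segments.asMap(1); // termId -> start offset view of the same data
    segments.close();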
@@ -4,9 +4,9 @@ import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.PositionsFileConstructor;
-import nu.marginalia.index.construction.ReversePreindex;
-import nu.marginalia.index.construction.TestJournalFactory;
-import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.construction.full.FullPreindex;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
 import nu.marginalia.index.positions.PositionsFileReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -19,7 +19,7 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;

-import static nu.marginalia.index.construction.TestJournalFactory.wm;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
 import static org.junit.jupiter.api.Assertions.*;

 class ReverseIndexReaderTest {
@@ -99,7 +99,7 @@ class ReverseIndexReaderTest {
         Path wordsFile = tempDir.resolve("words.dat");

         try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
-            var preindex = ReversePreindex.constructPreindex(reader,
+            var preindex = FullPreindex.constructPreindex(reader,
                     positionsFileConstructor,
                     DocIdRewriter.identity(), tempDir);
             preindex.finalizeIndex(docsFile, wordsFile);
@@ -1,5 +1,7 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -11,10 +13,10 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;

-import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData;
 import static org.junit.jupiter.api.Assertions.assertEquals;

-class ReversePreindexDocsTest {
+class FullPreindexDocsTest {
     Path countsFile;
     Path wordsIdFile;
     Path docsFile;
@@ -57,8 +59,8 @@ class ReversePreindexDocsTest {
                 new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);

         List<TestSegmentData> expected = List.of(
                 new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
@@ -86,8 +88,8 @@ class ReversePreindexDocsTest {
                 new EntryData(-0xF00BA3L, 0, 4, 4)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
                 new PositionsFileConstructor(positionsFile),
                 segments);

@@ -115,8 +117,8 @@ class ReversePreindexDocsTest {
                 new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(),
                 new PositionsFileConstructor(positionsFile),
                 segments);

@@ -1,8 +1,10 @@

-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.model.BTreeHeader;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -12,11 +14,11 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;

-import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;

-class ReversePreindexFinalizeTest {
+class FullPreindexFinalizeTest {
     TestJournalFactory journalFactory;
     Path positionsFile;
     Path countsFile;
@@ -52,7 +54,7 @@ class ReversePreindexFinalizeTest {
     @Test
     public void testFinalizeSimple() throws IOException {
         var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
-        var preindex = ReversePreindex.constructPreindex(reader,
+        var preindex = FullPreindex.constructPreindex(reader,
                 new PositionsFileConstructor(positionsFile),
                 DocIdRewriter.identity(), tempDir);

@@ -90,7 +92,7 @@ class ReversePreindexFinalizeTest {
                 new EntryDataWithWordMeta(101, 101, wm(51, 52))
         );

-        var preindex = ReversePreindex.constructPreindex(reader,
+        var preindex = FullPreindex.constructPreindex(reader,
                 new PositionsFileConstructor(positionsFile),
                 DocIdRewriter.identity(), tempDir);

@@ -1,6 +1,8 @@

-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -10,10 +12,10 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;

-import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.*;
 import static org.junit.jupiter.api.Assertions.assertEquals;

-class ReversePreindexMergeTest {
+class FullPreindexMergeTest {
     TestJournalFactory journalFactory;
     Path countsFile;
     Path wordsIdFile;
@@ -46,19 +48,19 @@ class ReversePreindexMergeTest {
         Files.delete(tempDir);
     }

-    public ReversePreindex runMergeScenario(
+    public FullPreindex runMergeScenario(
             List<EntryDataWithWordMeta> leftData,
             List<EntryDataWithWordMeta> rightData
     ) throws IOException {
         var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
         var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));

-        var left = ReversePreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
-        var right = ReversePreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
-        return ReversePreindex.merge(tempDir, left, right);
+        var left = FullPreindex.constructPreindex(reader1, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
+        var right = FullPreindex.constructPreindex(reader2, new PositionsFileConstructor(positionsFile), DocIdRewriter.identity(), tempDir);
+        return FullPreindex.merge(tempDir, left, right);
     }

-    private List<TestSegmentData> getData(ReversePreindex merged) {
+    private List<TestSegmentData> getData(FullPreindex merged) {
         var iter = merged.segments.iterator(2);
         List<TestSegmentData> actual = new ArrayList<>();
         while (iter.next()) {
@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

 import nu.marginalia.array.LongArray;
 import org.junit.jupiter.api.AfterEach;
@@ -11,10 +11,10 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;

-import static nu.marginalia.index.construction.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.*;
 import static org.junit.jupiter.api.Assertions.*;

-class ReversePreindexWordSegmentsTest {
+class FullPreindexWordSegmentsTest {
     Path countsFile;
     Path wordsIdFile;
     Path docsFile;
@@ -51,7 +51,7 @@ class ReversePreindexWordSegmentsTest {
                 new EntryData(-0xF00BA3L, 0, 1L<<33)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
         var iter = segments.iterator(1);

         List<TestSegmentData> expected = List.of(
@@ -72,7 +72,7 @@ class ReversePreindexWordSegmentsTest {
                 new EntryData(-0xF00BA3L, 0, 5, 5)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
         var iter = segments.iterator(1);

         List<TestSegmentData> expected = List.of(
@@ -94,7 +94,7 @@ class ReversePreindexWordSegmentsTest {
                 new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
         var iter = segments.iterator(1);

         List<TestSegmentData> expected = List.of(
@@ -120,7 +120,7 @@ class ReversePreindexWordSegmentsTest {
                 new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
         );

-        var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+        var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
         var iter = segments.iterator(1);

         List<TestSegmentData> expected = List.of(
@@ -148,7 +148,7 @@ class ReversePreindexWordSegmentsTest {
         LongArray countsArray = LongArray.allocate(4);
         wordsArray.set(0, -1, -2, -3, -4);
         countsArray.set(0, 2, 1, 3, 5);
-        var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
+        var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);

         var ritr = segments.iterator(1);
         assertTrue(ritr.hasMorePositions());
@@ -196,7 +196,7 @@ class ReversePreindexWordSegmentsTest {
         LongArray wordsArray = LongArray.allocate(4);
         LongArray countsArray = LongArray.allocate(4);
         wordsArray.set(0, -1, -2, -3, -4);
-        var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
+        var segments = new FullPreindexWordSegments(wordsArray, countsArray, null, null);

         var citr = segments.constructionIterator(1);
         assertEquals(-1, citr.wordId);
@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

 import java.util.Arrays;

@@ -3,13 +3,11 @@ package nu.marginalia.index;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.ints.IntList;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
-import it.unimi.dsi.fastutil.longs.LongList;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
@@ -33,7 +31,6 @@ import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
 import nu.marginalia.storage.FileStorageService;
 import org.junit.jupiter.api.AfterEach;
@@ -247,7 +244,7 @@ public class CombinedIndexReaderTest {
         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

         var constructor =
-                new ReverseIndexConstructor(
+                new FullIndexConstructor(
                         outputFileDocs,
                         outputFileWords,
                         outputFilePositions,
@@ -267,7 +264,7 @@ public class CombinedIndexReaderTest {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -14,7 +14,7 @@ import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -291,7 +291,7 @@ public class IndexQueryServiceIntegrationSmokeTest {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -313,7 +313,7 @@ public class IndexQueryServiceIntegrationSmokeTest {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -7,13 +7,13 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
 import nu.marginalia.api.searchquery.model.query.SearchSpecification;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.index.StatefulIndex;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.ReverseIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -493,7 +493,7 @@ public class IndexQueryServiceIntegrationTest {
         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

         var constructor =
-                new ReverseIndexConstructor(
+                new FullIndexConstructor(
                         outputFileDocs,
                         outputFileWords,
                         outputFilePositions,
@@ -513,7 +513,7 @@ public class IndexQueryServiceIntegrationTest {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -6,10 +6,11 @@ import com.google.inject.Inject;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.ProcessConfigurationModule;
+import nu.marginalia.index.construction.prio.PrioIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.service.ProcessMainClass;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
@@ -117,7 +118,7 @@ public class IndexConstructorMain extends ProcessMainClass {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -142,7 +143,7 @@ public class IndexConstructorMain extends ProcessMainClass {
         // important to the document. This filter will act on the encoded {@see WordMetadata}
         LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
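The priority-index branch above only admits entries that pass wordMetaFilter; getPriorityIndexWordMetaFilter() is defined elsewhere in this file. As a rough sketch of the shape such a predicate takes (PRIORITY_FLAGS_MASK is an assumed constant, not taken from this commit):

    // Keep only words whose encoded metadata has an "important" flag bit set
    // (e.g. title or subject words); everything else stays out of the prio index.
    java.util.function.LongPredicate wordMetaFilter =
            meta -> (meta & PRIORITY_FLAGS_MASK) != 0L;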
@@ -17,7 +17,7 @@ import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
 import nu.marginalia.index.ReverseIndexPrioFileNames;
-import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
@@ -244,7 +244,7 @@ public class IntegrationTest {

         if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,
@@ -269,7 +269,7 @@ public class IntegrationTest {
         // important to the document. This filter will act on the encoded {@see WordMetadata}
         LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();

-        var constructor = new ReverseIndexConstructor(
+        var constructor = new FullIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,