diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
similarity index 73%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
index dd5499bf..ccf21331 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.algo.LongArrayTransformations;
@@ -9,7 +9,7 @@ import java.io.IOException;
 import java.nio.channels.FileChannel;
 
 /** Constructs the BTrees in a reverse index */
-public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     private final BTreeWriter writer;
     private final FileChannel intermediateChannel;
@@ -18,10 +18,10 @@ public class ReverseIndexBTreeTransformer implements LongArrayTransformations.Lo
     long start = 0;
     long writeOffset = 0;
 
-    public ReverseIndexBTreeTransformer(LongArray urlsFileMap,
-                                        int entrySize,
-                                        BTreeContext bTreeContext,
-                                        FileChannel intermediateChannel) {
+    public FullIndexBTreeTransformer(LongArray urlsFileMap,
+                                     int entrySize,
+                                     BTreeContext bTreeContext,
+                                     FileChannel intermediateChannel) {
         this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
         this.entrySize = entrySize;
         this.intermediateChannel = intermediateChannel;
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java
similarity index 74%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java
index 9fa3ed93..db7d5604 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java
@@ -1,6 +1,9 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
@@ -10,9 +13,9 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.concurrent.atomic.AtomicInteger;
 
-public class ReverseIndexConstructor {
+public class FullIndexConstructor {
 
-    private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullIndexConstructor.class);
 
     public enum CreateReverseIndexSteps {
         CONSTRUCT,
@@ -27,12 +30,12 @@ public class ReverseIndexConstructor {
     private final DocIdRewriter docIdRewriter;
     private final Path tmpDir;
 
-    public ReverseIndexConstructor(Path outputFileDocs,
-                                   Path outputFileWords,
-                                   Path outputFilePositions,
-                                   JournalReaderSource readerSource,
-                                   DocIdRewriter docIdRewriter,
-                                   Path tmpDir) {
+    public FullIndexConstructor(Path outputFileDocs,
+                                Path outputFileWords,
+                                Path outputFilePositions,
+                                JournalReaderSource readerSource,
+                                DocIdRewriter docIdRewriter,
+                                Path tmpDir) {
         this.outputFileDocs = outputFileDocs;
         this.outputFileWords = outputFileWords;
         this.outputFilePositions = outputFilePositions;
@@ -77,20 +80,20 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private ReversePreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
-        return ReversePreindex
+    private FullPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return FullPreindex
                 .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
                 .closeToReference();
     }
 
     @SneakyThrows
-    private ReversePreindexReference merge(ReversePreindexReference leftR, ReversePreindexReference rightR) {
+    private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
 
         var left = leftR.open();
         var right = rightR.open();
 
         try {
-            return ReversePreindex.merge(tmpDir, left, right).closeToReference();
+            return FullPreindex.merge(tmpDir, left, right).closeToReference();
         }
         finally {
             left.delete();
@@ -101,7 +104,7 @@ public class ReverseIndexConstructor {
     }
 
     @SneakyThrows
-    private void finalizeIndex(ReversePreindexReference finalPR) {
+    private void finalizeIndex(FullPreindexReference finalPR) {
         var finalP = finalPR.open();
         finalP.finalizeIndex(outputFileDocs, outputFileWords);
         finalP.delete();
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
similarity index 79%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
index 3abe8171..668263d8 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
@@ -1,9 +1,13 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -25,13 +29,13 @@ import static nu.marginalia.array.algo.TwoArrayOperations.*;
  * the union of their data. This operation requires no additional
  * RAM.
  */
-public class ReversePreindex {
-    final ReversePreindexWordSegments segments;
-    final ReversePreindexDocuments documents;
+public class FullPreindex {
+    final FullPreindexWordSegments segments;
+    final FullPreindexDocuments documents;
 
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindex.class);
 
-    public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindex(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this.segments = segments;
         this.documents = documents;
     }
@@ -39,27 +43,27 @@ public class ReversePreindex {
     /** Constructs a new preindex with the data associated with reader. The backing files
      * will have randomly assigned names.
      */
-    public static ReversePreindex constructPreindex(IndexJournalReader reader,
-                                                    PositionsFileConstructor positionsFileConstructor,
-                                                    DocIdRewriter docIdRewriter,
-                                                    Path workDir) throws IOException
+    public static FullPreindex constructPreindex(IndexJournalReader reader,
+                                                 PositionsFileConstructor positionsFileConstructor,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
     {
         Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
-        var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
-        return new ReversePreindex(segments, docs);
+        var segments = FullPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        return new FullPreindex(segments, docs);
     }
 
     /** Close the associated memory mapped areas and return
      * a dehydrated version of this object that can be re-opened
      * later.
      */
-    public ReversePreindexReference closeToReference() {
+    public FullPreindexReference closeToReference() {
         try {
-            return new ReversePreindexReference(segments, documents);
+            return new FullPreindexReference(segments, documents);
         }
         finally {
             segments.force();
@@ -85,7 +89,7 @@ public class ReversePreindex {
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
         try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
             offsets.transformEachIO(0, offsets.size(),
-                    new ReverseIndexBTreeTransformer(finalDocs, 2,
+                    new FullIndexBTreeTransformer(finalDocs, 2,
                             ReverseIndexParameters.docsBTreeContext,
                             intermediateDocChannel));
             intermediateDocChannel.force(false);
@@ -126,11 +130,11 @@ public class ReversePreindex {
         documents.delete();
     }
 
-    public static ReversePreindex merge(Path destDir,
-                                        ReversePreindex left,
-                                        ReversePreindex right) throws IOException {
+    public static FullPreindex merge(Path destDir,
+                                     FullPreindex left,
+                                     FullPreindex right) throws IOException {
 
-        ReversePreindexWordSegments mergingSegment =
+        FullPreindexWordSegments mergingSegment =
                 createMergedSegmentWordFile(destDir, left.segments, right.segments);
 
         var mergingIter = mergingSegment.constructionIterator(2);
@@ -198,18 +202,18 @@ public class ReversePreindex {
         mergedDocuments = shrinkMergedDocuments(mergedDocuments,
                 docsFile, 2 * mergingSegment.totalSize());
 
-        return new ReversePreindex(
+        return new FullPreindex(
                 mergingSegment,
-                new ReversePreindexDocuments(mergedDocuments, docsFile)
+                new FullPreindexDocuments(mergedDocuments, docsFile)
         );
     }
 
     /** Create a segment word file with each word from both inputs, with zero counts for all the data.
      * This is an intermediate product in merging.
      */
-    static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
-                                                                   ReversePreindexWordSegments left,
-                                                                   ReversePreindexWordSegments right) throws IOException {
+    static FullPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+                                                                FullPreindexWordSegments left,
+                                                                FullPreindexWordSegments right) throws IOException {
         Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
 
@@ -228,7 +232,7 @@ public class ReversePreindex {
 
         LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
 
-        return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+        return new FullPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
     }
 
     /** It's possible we overestimated the necessary size of the documents file,
@@ -256,12 +260,12 @@ public class ReversePreindex {
 
     /** Merge contents of the segments indicated by leftIter and rightIter into the destionation
      * segment, and advance the construction iterator with the appropriate size.
      */
-    private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
-                                      ReversePreindexWordSegments.SegmentIterator rightIter,
-                                      ReversePreindexDocuments left,
-                                      ReversePreindexDocuments right,
+    private static void mergeSegments(FullPreindexWordSegments.SegmentIterator leftIter,
+                                      FullPreindexWordSegments.SegmentIterator rightIter,
+                                      FullPreindexDocuments left,
+                                      FullPreindexDocuments right,
                                       LongArray dest,
-                                      ReversePreindexWordSegments.SegmentConstructionIterator destIter)
+                                      FullPreindexWordSegments.SegmentConstructionIterator destIter)
     {
         long segSize = mergeArrays2(dest,
                 left.documents,
@@ -279,10 +283,10 @@ public class ReversePreindex {
 
     /** Copy the data from the source segment at the position and length indicated by sourceIter,
      * into the destination segment, and advance the construction iterator.
      */
-    private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+    private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
+                                       LongArray dest,
+                                       FileChannel sourceChannel,
+                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java
similarity index 84%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java
index d0d5ed7e..49442367 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java
@@ -1,8 +1,10 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import lombok.SneakyThrows;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.rwf.RandomFileAssembler;
 import org.slf4j.Logger;
@@ -20,35 +22,35 @@ import java.util.concurrent.TimeUnit;
 
 /** A LongArray with document data, segmented according to
  * the associated ReversePreindexWordSegments data
  */
-public class ReversePreindexDocuments {
+public class FullPreindexDocuments {
     public final LongArray documents;
 
     private static PositionsFileConstructor positionsFileConstructor;
     private static final int RECORD_SIZE_LONGS = 2;
-    private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
+    private static final Logger logger = LoggerFactory.getLogger(FullPreindexDocuments.class);
 
     public final Path file;
 
-    public ReversePreindexDocuments(LongArray documents, Path file) {
+    public FullPreindexDocuments(LongArray documents, Path file) {
         this.documents = documents;
         this.file = file;
     }
 
-    public static ReversePreindexDocuments construct(
+    public static FullPreindexDocuments construct(
             Path docsFile,
             Path workDir,
             IndexJournalReader reader,
             DocIdRewriter docIdRewriter,
             PositionsFileConstructor positionsFileConstructor,
-            ReversePreindexWordSegments segments) throws IOException {
-        ReversePreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+            FullPreindexWordSegments segments) throws IOException {
+        FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
 
         createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
 
         LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
         sortDocsFile(docsFileMap, segments);
 
-        return new ReversePreindexDocuments(docsFileMap, docsFile);
+        return new FullPreindexDocuments(docsFileMap, docsFile);
     }
 
     public FileChannel createDocumentsFileChannel() throws IOException {
@@ -67,7 +69,7 @@ public class ReversePreindexDocuments {
     private static void createUnsortedDocsFile(Path docsFile,
                                                Path workDir,
                                                IndexJournalReader reader,
-                                               ReversePreindexWordSegments segments,
+                                               FullPreindexWordSegments segments,
                                                DocIdRewriter docIdRewriter) throws IOException {
 
         long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
@@ -99,7 +101,7 @@ public class ReversePreindexDocuments {
     }
 
     @SneakyThrows
-    private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
+    private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) throws IOException {
 
         var iter = segments.iterator(RECORD_SIZE_LONGS);
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
similarity index 62%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
index 16c542d5..9045b0c7 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
@@ -1,33 +1,33 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import nu.marginalia.array.LongArrayFactory;
 
 import java.io.IOException;
 import java.nio.file.Path;
 
-/** This is a dehydrated version of a ReversePreIndex, that only
+/** This is a dehydrated version of a FullPreIndex, that only
  * keeps references to its location on disk but does not hold associated
  * memory maps.
  */
-public record ReversePreindexReference(
+public record FullPreindexReference(
         Path wordsFile,
         Path countsFile,
         Path documentsFile
 )
 {
-    public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+    public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) {
         this(segments.wordsFile, segments.countsFile, documents.file);
     }
 
-    public ReversePreindex open() throws IOException {
-        return new ReversePreindex(
-            new ReversePreindexWordSegments(
+    public FullPreindex open() throws IOException {
+        return new FullPreindex(
+            new FullPreindexWordSegments(
                 LongArrayFactory.mmapForModifyingShared(wordsFile),
                 LongArrayFactory.mmapForModifyingShared(countsFile),
                 wordsFile,
                 countsFile
             ),
-            new ReversePreindexDocuments(
+            new FullPreindexDocuments(
                 LongArrayFactory.mmapForModifyingShared(documentsFile),
                 documentsFile
             )
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
similarity index 89%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
index 0351ed45..eb744616 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
 
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
@@ -14,17 +14,17 @@ import java.nio.file.Path;
 
 /** A pair of file-backed arrays of sorted wordIds
  * and the count of documents associated with each termId.
  */
-public class ReversePreindexWordSegments {
+public class FullPreindexWordSegments {
     public final LongArray wordIds;
     public final LongArray counts;
 
     final Path wordsFile;
     final Path countsFile;
 
-    public ReversePreindexWordSegments(LongArray wordIds,
-                                       LongArray counts,
-                                       Path wordsFile,
-                                       Path countsFile)
+    public FullPreindexWordSegments(LongArray wordIds,
+                                    LongArray counts,
+                                    Path wordsFile,
+                                    Path countsFile)
     {
         assert wordIds.size() == counts.size();
@@ -51,9 +51,9 @@ public class ReversePreindexWordSegments {
         return ret;
     }
 
-    public static ReversePreindexWordSegments construct(IndexJournalReader reader,
-                                                        Path wordIdsFile,
-                                                        Path countsFile)
+    public static FullPreindexWordSegments construct(IndexJournalReader reader,
+                                                     Path wordIdsFile,
+                                                     Path countsFile)
             throws IOException
     {
         Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
@@ -79,7 +79,7 @@ public class ReversePreindexWordSegments {
             counts.set(i, countsMap.get(words.get(i)));
         }
 
-        return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
+        return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile);
     }
 
     public SegmentIterator iterator(int recordSize) {
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexBTreeTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexBTreeTransformer.java
new file mode 100644
index 00000000..d402405a
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexBTreeTransformer.java
@@ -0,0 +1,48 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.LongArrayTransformations;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.btree.model.BTreeContext;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+
+/** Constructs the BTrees in a reverse index */
+public class PrioIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
+    private final BTreeWriter writer;
+    private final FileChannel intermediateChannel;
+
+    private final int entrySize;
+
+    long start = 0;
+    long writeOffset = 0;
+
+    public PrioIndexBTreeTransformer(LongArray urlsFileMap,
+                                     int entrySize,
+                                     BTreeContext bTreeContext,
+                                     FileChannel intermediateChannel) {
+        this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
+        this.entrySize = entrySize;
+        this.intermediateChannel = intermediateChannel;
+    }
+
+    @Override
+    public long transform(long pos, long end) throws IOException {
+
+        final int size = (int) ((end - start) / entrySize);
+
+        if (size == 0) {
+            return -1;
+        }
+
+        final long offsetForBlock = writeOffset;
+
+        writeOffset += writer.write(writeOffset, size,
+                mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+        );
+
+        start = end;
+        return offsetForBlock;
+    }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java
new file mode 100644
index 00000000..4cad80b9
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java
@@ -0,0 +1,114 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.JournalReaderSource;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.IndexJournalFileNames;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class PrioIndexConstructor {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class);
+
+    public enum CreateReverseIndexSteps {
+        CONSTRUCT,
+        FINALIZE,
+        FINISHED
+    }
+
+    private final Path outputFileDocs;
+    private final Path outputFileWords;
+    private final Path outputFilePositions;
+    private final JournalReaderSource readerSource;
+    private final DocIdRewriter docIdRewriter;
+    private final Path tmpDir;
+
+    public PrioIndexConstructor(Path outputFileDocs,
+                                Path outputFileWords,
+                                Path outputFilePositions,
+                                JournalReaderSource readerSource,
+                                DocIdRewriter docIdRewriter,
+                                Path tmpDir) {
+        this.outputFileDocs = outputFileDocs;
+        this.outputFileWords = outputFileWords;
+        this.outputFilePositions = outputFilePositions;
+        this.readerSource = readerSource;
+        this.docIdRewriter = docIdRewriter;
+        this.tmpDir = tmpDir;
+    }
+
+    public void createReverseIndex(ProcessHeartbeat processHeartbeat,
+                                   String processName,
+                                   Path sourceBaseDir) throws IOException
+    {
+        var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
+        if (inputs.isEmpty()) {
+            logger.error("No journal files in base dir {}", sourceBaseDir);
+            return;
+        }
+
+        try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
+             var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
+             var posConstructor = new PositionsFileConstructor(outputFilePositions)
+        ) {
+            heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
+
+            AtomicInteger progress = new AtomicInteger(0);
+
+            inputs
+                .parallelStream()
+                .map(in -> {
+                    preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
+                    return construct(in, posConstructor);
+                })
+                .reduce(this::merge)
+                .ifPresent((index) -> {
+                    heartbeat.progress(CreateReverseIndexSteps.FINALIZE);
+                    finalizeIndex(index);
+                    heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+                });
+
+            heartbeat.progress(CreateReverseIndexSteps.FINISHED);
+        }
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+        return PrioPreindex
+                .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
+                .closeToReference();
+    }
+
+    @SneakyThrows
+    private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
+
+        var left = leftR.open();
+        var right = rightR.open();
+
+        try {
+            return PrioPreindex.merge(tmpDir, left, right).closeToReference();
+        }
+        finally {
+            left.delete();
+            right.delete();
+        }
+
+
+    }
+
+    @SneakyThrows
+    private void finalizeIndex(PrioPreindexReference finalPR) {
+        var finalP = finalPR.open();
+        finalP.finalizeIndex(outputFileDocs, outputFileWords);
+        finalP.delete();
+    }
+
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
new file mode 100644
index 00000000..f5449231
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
@@ -0,0 +1,310 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static nu.marginalia.array.algo.TwoArrayOperations.*;
+
+/** Contains the data that would go into a reverse index,
+ * that is, a mapping from words to documents, minus the actual
+ * index structure that makes the data quick to access while
+ * searching.
+ *
+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
+public class PrioPreindex {
+ final PrioPreindexWordSegments segments;
+ final PrioPreindexDocuments documents;
+
+ private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
+
+ public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+ this.segments = segments;
+ this.documents = documents;
+ }
+
+ /** Constructs a new preindex with the data associated with reader. The backing files
+ * will have randomly assigned names.
+ */
+ public static PrioPreindex constructPreindex(IndexJournalReader reader,
+ PositionsFileConstructor positionsFileConstructor,
+ DocIdRewriter docIdRewriter,
+ Path workDir) throws IOException
+ {
+ Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
+ Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
+ Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
+
+ var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+ var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+ return new PrioPreindex(segments, docs);
+ }
+
+ /** Close the associated memory mapped areas and return
+ * a dehydrated version of this object that can be re-opened
+ * later.
+ */
+ public PrioPreindexReference closeToReference() {
+ try {
+ return new PrioPreindexReference(segments, documents);
+ }
+ finally {
+ segments.force();
+ documents.force();
+ segments.close();
+ documents.close();
+ }
+ }
+
+ /** Transform the preindex into a reverse index */
+ public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+ var offsets = segments.counts;
+
+ Files.deleteIfExists(outputFileDocs);
+ Files.deleteIfExists(outputFileWords);
+
+ // Estimate the size of the docs index data
+ offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
+ IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+ offsets.fold(0, 0, offsets.size(), sizeEstimator);
+
+ // Write the docs file
+ LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
+ try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
+ offsets.transformEachIO(0, offsets.size(),
+ new PrioIndexBTreeTransformer(finalDocs, 2,
+ ReverseIndexParameters.docsBTreeContext,
+ intermediateDocChannel));
+ intermediateDocChannel.force(false);
+ }
+
+ LongArray wordIds = segments.wordIds;
+
+ if (offsets.size() != wordIds.size())
+ throw new IllegalStateException("Offsets and word-ids of different size");
+ if (offsets.size() > Integer.MAX_VALUE) {
+ throw new IllegalStateException("offsets.size() too big!");
+ }
+
+ // Estimate the size of the words index data
+ long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+ // Construct the tree
+ LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
+
+ new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+ .write(0, (int) offsets.size(), mapRegion -> {
+ for (long i = 0; i < offsets.size(); i++) {
+ mapRegion.set(2*i, wordIds.get(i));
+ mapRegion.set(2*i + 1, offsets.get(i));
+ }
+ });
+
+ finalDocs.force();
+ finalDocs.close();
+ wordsArray.force();
+ wordsArray.close();
+
+ }
+
+ /** Delete all files associated with this pre-index */
+ public void delete() throws IOException {
+ segments.delete();
+ documents.delete();
+ }
+
+ public static PrioPreindex merge(Path destDir,
+ PrioPreindex left,
+ PrioPreindex right) throws IOException {
+
+ PrioPreindexWordSegments mergingSegment =
+ createMergedSegmentWordFile(destDir, left.segments, right.segments);
+
+ var mergingIter = mergingSegment.constructionIterator(2);
+ var leftIter = left.segments.iterator(2);
+ var rightIter = right.segments.iterator(2);
+
+ Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+ LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size());
+
+ leftIter.next();
+ rightIter.next();
+
+ try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
+ FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+ {
+
+ while (mergingIter.canPutMore()
+ && leftIter.isPositionBeforeEnd()
+ && rightIter.isPositionBeforeEnd())
+ {
+ final long currentWord = mergingIter.wordId;
+
+ if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+ {
+ // both inputs have documents for the current word
+ mergeSegments(leftIter, rightIter,
+ left.documents, right.documents,
+ mergedDocuments, mergingIter);
+ }
+ else if (leftIter.wordId == currentWord) {
+ if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
+ break;
+ }
+ else if (rightIter.wordId == currentWord) {
+ if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
+ break;
+ }
+ else assert false : "This should never happen"; // the helvetica scenario
+ }
+
+ if (leftIter.isPositionBeforeEnd()) {
+ while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+ }
+
+ if (rightIter.isPositionBeforeEnd()) {
+ while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+ }
+
+ }
+
+ if (leftIter.isPositionBeforeEnd())
+ throw new IllegalStateException("Left has more to go");
+ if (rightIter.isPositionBeforeEnd())
+ throw new IllegalStateException("Right has more to go");
+ if (mergingIter.canPutMore())
+ throw new IllegalStateException("Source iters ran dry before merging iter");
+
+
+ mergingSegment.force();
+
+ // We may have overestimated the size of the merged docs file in the case there were
+ // duplicates in the data, so we need to shrink it to the actual size we wrote.
+
+ mergedDocuments = shrinkMergedDocuments(mergedDocuments,
+ docsFile, 2 * mergingSegment.totalSize());
+
+ return new PrioPreindex(
+ mergingSegment,
+ new PrioPreindexDocuments(mergedDocuments, docsFile)
+ );
+ }
+
+ /** Create a segment word file with each word from both inputs, with zero counts for all the data.
+ * This is an intermediate product in merging.
+ */
+ static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir,
+ PrioPreindexWordSegments left,
+ PrioPreindexWordSegments right) throws IOException {
+ Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+ Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+ // We need total size to request a direct LongArray range. Seems slower, but is faster.
+ // ... see LongArray.directRangeIfPossible(long start, long end)
+ long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+ 0, left.wordIds.size(),
+ 0, right.wordIds.size());
+
+ LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
+
+ mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+ 0,
+ 0, left.wordIds.size(),
+ 0, right.wordIds.size());
+
+ LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
+
+ return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+ }
+
+ /** It's possible we overestimated the necessary size of the documents file,
+ * this will permit us to shrink it down to the smallest necessary size.
+ */
+ private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
+
+ mergedDocuments.force();
+
+ long beforeSize = mergedDocuments.size();
+ long afterSize = sizeLongs * 8;
+ if (beforeSize != afterSize) {
+ mergedDocuments.close();
+ try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
+ bc.truncate(sizeLongs * 8);
+ }
+
+ logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
+ mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
+ }
+
+ return mergedDocuments;
+ }
+
+ /** Merge contents of the segments indicated by leftIter and rightIter into the destination
+ * segment, and advance the construction iterator with the appropriate size.
+ */
+ private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
+ PrioPreindexWordSegments.SegmentIterator rightIter,
+ PrioPreindexDocuments left,
+ PrioPreindexDocuments right,
+ LongArray dest,
+ PrioPreindexWordSegments.SegmentConstructionIterator destIter)
+ {
+ long segSize = mergeArrays2(dest,
+ left.documents,
+ right.documents,
+ destIter.startOffset,
+ leftIter.startOffset, leftIter.endOffset,
+ rightIter.startOffset, rightIter.endOffset);
+
+ long distinct = segSize / 2;
+ destIter.putNext(distinct);
+ leftIter.next();
+ rightIter.next();
+ }
+
+ /** Copy the data from the source segment at the position and length indicated by sourceIter,
+ * into the destination segment, and advance the construction iterator.
+ */
+ private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
+ LongArray dest,
+ FileChannel sourceChannel,
+ PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+
+ long size = sourceIter.endOffset - sourceIter.startOffset;
+ long start = mergingIter.startOffset;
+ long end = start + size;
+
+ dest.transferFrom(sourceChannel,
+ sourceIter.startOffset,
+ mergingIter.startOffset,
+ end);
+
+ boolean putNext = mergingIter.putNext(size / 2);
+ boolean iterNext = sourceIter.next();
+
+ if (!putNext && iterNext)
+ throw new IllegalStateException("Source iterator ran out before dest iterator?!");
+
+ return iterNext;
+ }
+
+
+}
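
The intended lifecycle of a PrioPreindex is the construct/merge/finalizeIndex sequence that PrioIndexConstructor drives above. A minimal sketch of that flow in isolation, assuming two journal files and an existing work directory; journalA, journalB, workDir and the output paths are hypothetical placeholders, not part of this change:

    // Illustrative only: one preindex per journal file, merged pairwise, then finalized.
    try (var positions = new PositionsFileConstructor(outputFilePositions)) {
        PrioPreindex a = PrioPreindex.constructPreindex(readerSource.construct(journalA),
                positions, DocIdRewriter.identity(), workDir);
        PrioPreindex b = PrioPreindex.constructPreindex(readerSource.construct(journalB),
                positions, DocIdRewriter.identity(), workDir);

        PrioPreindex merged = PrioPreindex.merge(workDir, a, b);
        a.delete();
        b.delete();

        merged.finalizeIndex(outputFileDocs, outputFileWords);
        merged.delete();
    }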
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
new file mode 100644
index 00000000..03edb4b4
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
@@ -0,0 +1,141 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.rwf.RandomFileAssembler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+/** A LongArray with document data, segmented according to
+ * the associated PrioPreindexWordSegments data
+ */
+public class PrioPreindexDocuments {
+ public final LongArray documents;
+
+ private static PositionsFileConstructor positionsFileConstructor;
+ private static final int RECORD_SIZE_LONGS = 2;
+ private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
+
+ public final Path file;
+
+ public PrioPreindexDocuments(LongArray documents, Path file) {
+ this.documents = documents;
+ this.file = file;
+ }
+
+ public static PrioPreindexDocuments construct(
+ Path docsFile,
+ Path workDir,
+ IndexJournalReader reader,
+ DocIdRewriter docIdRewriter,
+ PositionsFileConstructor positionsFileConstructor,
+ PrioPreindexWordSegments segments) throws IOException {
+ PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
+
+ createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
+
+ LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
+ sortDocsFile(docsFileMap, segments);
+
+ return new PrioPreindexDocuments(docsFileMap, docsFile);
+ }
+
+ public FileChannel createDocumentsFileChannel() throws IOException {
+ return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+ }
+
+
+ public LongArray slice(long start, long end) {
+ return documents.range(start, end);
+ }
+
+ public long size() {
+ return documents.size();
+ }
+
+ private static void createUnsortedDocsFile(Path docsFile,
+ Path workDir,
+ IndexJournalReader reader,
+ PrioPreindexWordSegments segments,
+ DocIdRewriter docIdRewriter) throws IOException {
+
+ long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
+
+ try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+ var pointer = reader.newPointer())
+ {
+
+ var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+ offsetMap.defaultReturnValue(0);
+
+ while (pointer.nextDocument()) {
+ long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
+ for (var termData : pointer) {
+ long termId = termData.termId();
+
+ long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+
+ // write position data to the positions file and get the offset
+ long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
+
+ assembly.put(offset + 0, rankEncodedId);
+ assembly.put(offset + 1, encodedPosOffset);
+ }
+ }
+
+ assembly.write(docsFile);
+ }
+ }
+
+ @SneakyThrows
+ private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) throws IOException {
+
+ var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+ ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
+
+ while (iter.next()) {
+ long iterStart = iter.startOffset;
+ long iterEnd = iter.endOffset;
+
+ if (iter.size() < 1024) {
+ docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+ }
+ else {
+ sortingWorkers.execute(() ->
+ docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
+ }
+ }
+
+ sortingWorkers.shutdown();
+ while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
+
+ sortingWorkers.close();
+ }
+
+ public void delete() throws IOException {
+ Files.delete(this.file);
+ documents.close();
+ }
+
+ public void close() {
+ documents.close();
+ }
+
+ public void force() {
+ documents.force();
+ }
+}
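
Each record in the docs file is RECORD_SIZE_LONGS = 2 longs: the rank-encoded document id followed by the offset handed back by the positions file constructor, and the records for one term form one contiguous, sorted segment. A sketch of how a single segment could be walked after construction; docs and iter are assumed to be a constructed PrioPreindexDocuments and a SegmentIterator positioned on one term:

    // Illustrative only: read the (docId, positionsOffset) pairs of one term's segment.
    for (long pos = iter.startOffset; pos < iter.endOffset; pos += 2) {
        long rankEncodedDocId = docs.documents.get(pos);
        long encodedPosOffset = docs.documents.get(pos + 1);
        // resolve the term's positions for this document via the positions file
    }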
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
new file mode 100644
index 00000000..10b590dd
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
@@ -0,0 +1,36 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArrayFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/** This is a dehydrated version of a PrioPreindex that only
+ * keeps references to its location on disk but does not hold associated
+ * memory maps.
+ */
+public record PrioPreindexReference(
+ Path wordsFile,
+ Path countsFile,
+ Path documentsFile
+)
+{
+ public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+ this(segments.wordsFile, segments.countsFile, documents.file);
+ }
+
+ public PrioPreindex open() throws IOException {
+ return new PrioPreindex(
+ new PrioPreindexWordSegments(
+ LongArrayFactory.mmapForModifyingShared(wordsFile),
+ LongArrayFactory.mmapForModifyingShared(countsFile),
+ wordsFile,
+ countsFile
+ ),
+ new PrioPreindexDocuments(
+ LongArrayFactory.mmapForModifyingShared(documentsFile),
+ documentsFile
+ )
+ );
+ }
+}
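
The reference exists so a preindex can release its memory maps between pipeline stages and be reopened later, which is what lets the parallel construct/merge reduction above hold many preindexes without keeping them all mapped. A minimal round-trip sketch, assuming a freshly constructed preindex:

    // Illustrative only: dehydrate, then rehydrate when the next stage needs it.
    PrioPreindexReference ref = preindex.closeToReference();  // forces and unmaps segments + documents
    PrioPreindex reopened = ref.open();                       // re-maps the same backing files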
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
new file mode 100644
index 00000000..512f10ff
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
@@ -0,0 +1,205 @@
+package nu.marginalia.index.construction.prio;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongIterator;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/** A pair of file-backed arrays of sorted wordIds
+ * and the count of documents associated with each termId.
+ */
+public class PrioPreindexWordSegments {
+ public final LongArray wordIds;
+ public final LongArray counts;
+
+ final Path wordsFile;
+ final Path countsFile;
+
+ public PrioPreindexWordSegments(LongArray wordIds,
+ LongArray counts,
+ Path wordsFile,
+ Path countsFile)
+ {
+ assert wordIds.size() == counts.size();
+
+ this.wordIds = wordIds;
+ this.counts = counts;
+ this.wordsFile = wordsFile;
+ this.countsFile = countsFile;
+ }
+
+ /** Returns a long-long hash map where each key is a termId,
+ * and each value is the start offset of the data.
+ */
+ public Long2LongOpenHashMap asMap(int recordSize) {
+ if (wordIds.size() > Integer.MAX_VALUE)
+ throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries");
+
+ Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
+ var iter = iterator(recordSize);
+
+ while (iter.next()) {
+ ret.put(iter.wordId, iter.startOffset);
+ }
+
+ return ret;
+ }
+
+ public static PrioPreindexWordSegments construct(IndexJournalReader reader,
+ Path wordIdsFile,
+ Path countsFile)
+ throws IOException
+ {
+ Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
+ countsMap.defaultReturnValue(0);
+ reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
+
+ LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size());
+ LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size());
+
+ // Create the words file by iterating over the map and inserting them into
+ // the words file in whatever bizarro hash table order they appear in
+ long i = 0;
+ LongIterator iter = countsMap.keySet().iterator();
+ while (iter.hasNext()) {
+ words.set(i++, iter.nextLong());
+ }
+
+ // Sort the words file
+ words.sort(0, counts.size());
+
+ // Populate the counts
+ for (i = 0; i < countsMap.size(); i++) {
+ counts.set(i, countsMap.get(words.get(i)));
+ }
+
+ return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile);
+ }
+
+ public SegmentIterator iterator(int recordSize) {
+ return new SegmentIterator(recordSize);
+ }
+ public SegmentConstructionIterator constructionIterator(int recordSize) {
+ return new SegmentConstructionIterator(recordSize);
+ }
+
+ public long totalSize() {
+ return counts.fold(0, 0, counts.size(), Long::sum);
+ }
+
+ public void delete() throws IOException {
+ Files.delete(countsFile);
+ Files.delete(wordsFile);
+
+ counts.close();
+ wordIds.close();
+ }
+
+ public void force() {
+ counts.force();
+ wordIds.force();
+ }
+
+ public void close() {
+ wordIds.close();
+ counts.close();
+ }
+
+ public class SegmentIterator {
+ private final int recordSize;
+ private final long fileSize;
+ long wordId;
+ long startOffset = 0;
+ long endOffset = 0;
+
+ private SegmentIterator(int recordSize) {
+ this.recordSize = recordSize;
+ this.fileSize = wordIds.size();
+ }
+
+ private long i = -1;
+ public long idx() {
+ return i;
+ }
+ public boolean next() {
+ if (++i >= fileSize) {
+ wordId = Long.MIN_VALUE;
+ return false;
+ }
+
+ wordId = wordIds.get(i);
+ startOffset = endOffset;
+ endOffset = startOffset + recordSize * counts.get(i);
+
+ return true;
+ }
+
+ public boolean hasMorePositions() {
+ return i + 1 < wordIds.size();
+ }
+
+ public boolean isPositionBeforeEnd() {
+ return i < wordIds.size();
+ }
+
+ public long size() {
+ return endOffset - startOffset;
+ }
+ }
+
+ class SegmentConstructionIterator {
+ private final int recordSize;
+ private final long fileSize;
+ long wordId;
+ long startOffset = 0;
+ long endOffset = 0;
+
+ private SegmentConstructionIterator(int recordSize) {
+ this.recordSize = recordSize;
+ this.fileSize = wordIds.size();
+ if (fileSize == 0) {
+ throw new IllegalArgumentException("Cannot construct zero-length word segment file");
+ }
+ this.wordId = wordIds.get(0);
+ }
+
+ private long i = 0;
+ public long idx() {
+ return i;
+ }
+
+ public boolean putNext(long size) {
+
+ if (i >= fileSize)
+ return false;
+
+ endOffset = startOffset + recordSize * size;
+ counts.set(i, size);
+ startOffset = endOffset;
+ endOffset = -1;
+
+ i++;
+
+ if (i == fileSize) {
+ // We've reached the end of the iteration and there is no
+ // "next" termId to fetch
+ wordId = Long.MIN_VALUE;
+ return false;
+ }
+ else {
+ wordId = wordIds.get(i);
+ return true;
+ }
+ }
+
+ public boolean canPutMore() {
+ return i < wordIds.size();
+ }
+ }
+}
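
The two arrays are parallel: wordIds.get(i) is the i-th distinct termId and counts.get(i) is the number of documents containing it, so totalSize() is the sum of all counts and directly sizes the unsorted docs file. A small illustration with made-up numbers:

    // Illustrative only: three terms with 4, 1 and 2 postings respectively.
    // wordIds: [ 0x02, 0x11, 0x5F ]
    // counts:  [    4,    1,    2 ]
    long totalSize     = 4 + 1 + 2;     // segments.totalSize() == 7
    long fileSizeLongs = 2 * totalSize; // RECORD_SIZE_LONGS * totalSize == 14 longs in the docs file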
diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java
index 2d53dd2e..5047da90 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java
@@ -4,9 +4,9 @@ import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
-import nu.marginalia.index.construction.ReversePreindex;
-import nu.marginalia.index.construction.TestJournalFactory;
-import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.construction.full.FullPreindex;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.positions.PositionsFileReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@@ -19,7 +19,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
-import static nu.marginalia.index.construction.TestJournalFactory.wm;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.*;
class ReverseIndexReaderTest {
@@ -99,7 +99,7 @@ class ReverseIndexReaderTest {
Path wordsFile = tempDir.resolve("words.dat");
try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
- var preindex = ReversePreindex.constructPreindex(reader,
+ var preindex = FullPreindex.constructPreindex(reader,
positionsFileConstructor,
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(docsFile, wordsFile);
diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
similarity index 86%
rename from code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java
rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
index df378228..a5c87f0f 100644
--- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java
+++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java
@@ -1,5 +1,7 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -11,10 +13,10 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
+import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData;
import static org.junit.jupiter.api.Assertions.assertEquals;
-class ReversePreindexDocsTest {
+class FullPreindexDocsTest {
Path countsFile;
Path wordsIdFile;
Path docsFile;
@@ -57,8 +59,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
- var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
- var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
+ var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+ var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), new PositionsFileConstructor(positionsFile), segments);
List