mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(index-reverse) Add documentation and clean up code.
This commit is contained in:
parent
ba4513e82c
commit
a2e6616100
4
code/features-index/index-reverse/index.svg
Normal file
4
code/features-index/index-reverse/index.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 21 KiB |
4
code/features-index/index-reverse/merging.svg
Normal file
4
code/features-index/index-reverse/merging.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 21 KiB |
4
code/features-index/index-reverse/preindex.svg
Normal file
4
code/features-index/index-reverse/preindex.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 29 KiB |
@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i
|
|||||||
[1] See WordFlags in [common/model](../../common/model/) and
|
[1] See WordFlags in [common/model](../../common/model/) and
|
||||||
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
|
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
|
||||||
|
|
||||||
|
## Construction
|
||||||
|
|
||||||
|
The reverse index is constructed by first building a series of preindexes.
|
||||||
|
Preindexes consist of a Segment and a Documents object. The segment contains
|
||||||
|
information about which word identifiers are present and how many, and the
|
||||||
|
documents contain information about which documents the words can be found in.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
These would typically not fit in RAM, so the index journal is paged
|
||||||
|
and the preindexes are constructed small enough to fit in memory, and
|
||||||
|
then merged. Merging sorted arrays is a very fast operation that does
|
||||||
|
not require additional RAM.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Once merged into one large preindex, indexes are added to the preindex data
|
||||||
|
to form a finalized reverse index.
|
||||||
|
|
||||||
|

|
||||||
## Central Classes
|
## Central Classes
|
||||||
|
|
||||||
* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index.
|
* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
|
||||||
* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index.
|
* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
|
||||||
* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index.
|
* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
|
||||||
* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index.
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
* [index-journal](../index-journal)
|
||||||
|
* [index-forward](../index-forward)
|
||||||
|
* [libraries/btree](../../libraries/btree)
|
||||||
|
* [libraries/array](../../libraries/array)
|
@ -32,7 +32,7 @@ public class ReverseIndexConstructor {
|
|||||||
|
|
||||||
for (var input : inputs) {
|
for (var input : inputs) {
|
||||||
logger.info("Construcing preindex from {}", input);
|
logger.info("Construcing preindex from {}", input);
|
||||||
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir);
|
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir);
|
||||||
preindexes.add(preindex);
|
preindexes.add(preindex);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -16,9 +16,18 @@ import java.nio.file.StandardOpenOption;
|
|||||||
|
|
||||||
import static nu.marginalia.array.algo.TwoArrayOperations.*;
|
import static nu.marginalia.array.algo.TwoArrayOperations.*;
|
||||||
|
|
||||||
|
/** Contains the data that would go into a reverse index,
|
||||||
|
* that is, a mapping from words to documents, minus the actual
|
||||||
|
* index structure that makes the data quick to access while
|
||||||
|
* searching.
|
||||||
|
* <p>
|
||||||
|
* Two preindexes can be merged into a third preindex containing
|
||||||
|
* the union of their data. This operation requires no additional
|
||||||
|
* RAM.
|
||||||
|
*/
|
||||||
public class ReversePreindex {
|
public class ReversePreindex {
|
||||||
public final ReversePreindexWordSegments segments;
|
final ReversePreindexWordSegments segments;
|
||||||
public final ReversePreindexDocuments documents;
|
final ReversePreindexDocuments documents;
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
|
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
|
||||||
|
|
||||||
@ -27,6 +36,26 @@ public class ReversePreindex {
|
|||||||
this.documents = documents;
|
this.documents = documents;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Constructs a new preindex with the data associated with reader. The backing files
|
||||||
|
* will have randomly assigned names.
|
||||||
|
*/
|
||||||
|
public static ReversePreindex constructPreindex(IndexJournalReader reader,
|
||||||
|
DocIdRewriter docIdRewriter,
|
||||||
|
Path destDir) throws IOException
|
||||||
|
{
|
||||||
|
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
|
||||||
|
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
|
||||||
|
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
||||||
|
|
||||||
|
logger.info("Segmenting");
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
|
||||||
|
logger.info("Mapping docs");
|
||||||
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments);
|
||||||
|
logger.info("Done");
|
||||||
|
return new ReversePreindex(segments, docs);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Transform the preindex into a reverse index */
|
||||||
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
|
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
|
||||||
var offsets = segments.counts;
|
var offsets = segments.counts;
|
||||||
|
|
||||||
@ -72,22 +101,79 @@ public class ReversePreindex {
|
|||||||
segments.delete();
|
segments.delete();
|
||||||
documents.delete();
|
documents.delete();
|
||||||
}
|
}
|
||||||
public static ReversePreindex constructPreindex(IndexJournalReader reader,
|
|
||||||
DocIdRewriter docIdRewriter,
|
public static ReversePreindex merge(Path destDir,
|
||||||
Path tempDir,
|
ReversePreindex left,
|
||||||
Path destDir) throws IOException
|
ReversePreindex right) throws IOException {
|
||||||
{
|
|
||||||
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
|
ReversePreindexWordSegments mergingSegment =
|
||||||
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
|
createMergedSegmentWordFile(destDir, left.segments, right.segments);
|
||||||
|
|
||||||
|
var mergingIter = mergingSegment.constructionIterator(2);
|
||||||
|
var leftIter = left.segments.iterator(2);
|
||||||
|
var rightIter = right.segments.iterator(2);
|
||||||
|
|
||||||
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
||||||
|
|
||||||
SortingContext ctx = new SortingContext(tempDir, 1<<31);
|
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
|
||||||
logger.info("Segmenting");
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
|
leftIter.next();
|
||||||
logger.info("Mapping docs");
|
rightIter.next();
|
||||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments);
|
|
||||||
logger.info("Done");
|
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
|
||||||
return new ReversePreindex(segments, docs);
|
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
|
||||||
|
{
|
||||||
|
|
||||||
|
while (mergingIter.canPutMore()
|
||||||
|
&& leftIter.isPositionBeforeEnd()
|
||||||
|
&& rightIter.isPositionBeforeEnd())
|
||||||
|
{
|
||||||
|
final long currentWord = mergingIter.wordId;
|
||||||
|
|
||||||
|
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
|
||||||
|
{
|
||||||
|
// both inputs have documents for the current word
|
||||||
|
mergeSegments(leftIter, rightIter,
|
||||||
|
left.documents, right.documents,
|
||||||
|
mergedDocuments, mergingIter);
|
||||||
|
}
|
||||||
|
else if (leftIter.wordId == currentWord) {
|
||||||
|
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (rightIter.wordId == currentWord) {
|
||||||
|
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else assert false : "This should never happen"; // the helvetica scenario
|
||||||
|
}
|
||||||
|
|
||||||
|
if (leftIter.isPositionBeforeEnd()) {
|
||||||
|
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rightIter.isPositionBeforeEnd()) {
|
||||||
|
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
|
||||||
|
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
|
||||||
|
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
|
||||||
|
|
||||||
|
// We may have overestimated the size of the merged docs file in the case there were
|
||||||
|
// duplicates in the data, so we need to shrink it to the actual size we wrote.
|
||||||
|
|
||||||
|
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
|
||||||
|
docsFile, 2 * mergingSegment.totalSize());
|
||||||
|
|
||||||
|
mergingSegment.force();
|
||||||
|
|
||||||
|
return new ReversePreindex(
|
||||||
|
mergingSegment,
|
||||||
|
new ReversePreindexDocuments(mergedDocuments, docsFile)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
|
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
|
||||||
@ -114,79 +200,10 @@ public class ReversePreindex {
|
|||||||
|
|
||||||
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
|
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
|
||||||
}
|
}
|
||||||
public static ReversePreindex merge(Path destDir,
|
|
||||||
ReversePreindex left,
|
|
||||||
ReversePreindex right) throws IOException {
|
|
||||||
|
|
||||||
ReversePreindexWordSegments mergingSegment = createMergedSegmentWordFile(destDir,
|
|
||||||
left.segments,
|
|
||||||
right.segments);
|
|
||||||
|
|
||||||
var mergingIter = mergingSegment.constructionIterator(2);
|
|
||||||
var leftIter = left.segments.iterator(2);
|
|
||||||
var rightIter = right.segments.iterator(2);
|
|
||||||
|
|
||||||
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
|
||||||
|
|
||||||
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
|
|
||||||
|
|
||||||
leftIter.next();
|
|
||||||
rightIter.next();
|
|
||||||
|
|
||||||
FileChannel leftChannel = left.documents.createDocumentsFileChannel();
|
|
||||||
FileChannel rightChannel = right.documents.createDocumentsFileChannel();
|
|
||||||
|
|
||||||
while (mergingIter.canPutMore()
|
|
||||||
&& leftIter.isPositionBeforeEnd()
|
|
||||||
&& rightIter.isPositionBeforeEnd())
|
|
||||||
{
|
|
||||||
if (leftIter.wordId == mergingIter.wordId
|
|
||||||
&& rightIter.wordId == mergingIter.wordId) {
|
|
||||||
mergeSegments(leftIter,
|
|
||||||
rightIter,
|
|
||||||
left.documents,
|
|
||||||
right.documents,
|
|
||||||
mergedDocuments,
|
|
||||||
mergingIter);
|
|
||||||
}
|
|
||||||
else if (leftIter.wordId == mergingIter.wordId) {
|
|
||||||
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else if (rightIter.wordId == mergingIter.wordId) {
|
|
||||||
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert false : "This should never happen";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (leftIter.isPositionBeforeEnd()) {
|
|
||||||
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
|
|
||||||
|
|
||||||
}
|
|
||||||
if (rightIter.isPositionBeforeEnd()) {
|
|
||||||
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
|
|
||||||
}
|
|
||||||
|
|
||||||
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
|
|
||||||
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
|
|
||||||
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
|
|
||||||
|
|
||||||
// We may have overestimated the size of the merged docs size in the case there were
|
|
||||||
// duplicates in the data, so we need to shrink it to the actual size we wrote.
|
|
||||||
|
|
||||||
mergedDocuments = shrinkMergedDocuments(mergedDocuments, docsFile, 2 * mergingSegment.totalSize());
|
|
||||||
|
|
||||||
mergingSegment.force();
|
|
||||||
|
|
||||||
return new ReversePreindex(
|
|
||||||
mergingSegment,
|
|
||||||
new ReversePreindexDocuments(mergedDocuments, docsFile)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
/** It's possible we overestimated the necessary size of the documents file,
|
||||||
|
* this will permit us to shrink it down to the smallest necessary size.
|
||||||
|
*/
|
||||||
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
|
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
|
||||||
|
|
||||||
mergedDocuments.force();
|
mergedDocuments.force();
|
||||||
@ -205,12 +222,15 @@ public class ReversePreindex {
|
|||||||
return mergedDocuments;
|
return mergedDocuments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
|
||||||
|
* segment, and advance the construction iterator with the appropriate size.
|
||||||
|
*/
|
||||||
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
|
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
|
||||||
ReversePreindexWordSegments.SegmentIterator rightIter,
|
ReversePreindexWordSegments.SegmentIterator rightIter,
|
||||||
ReversePreindexDocuments left,
|
ReversePreindexDocuments left,
|
||||||
ReversePreindexDocuments right,
|
ReversePreindexDocuments right,
|
||||||
LongArray documentsFile,
|
LongArray dest,
|
||||||
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter)
|
ReversePreindexWordSegments.SegmentConstructionIterator destIter)
|
||||||
{
|
{
|
||||||
long distinct = countDistinctElementsN(2,
|
long distinct = countDistinctElementsN(2,
|
||||||
left.documents,
|
left.documents,
|
||||||
@ -218,29 +238,32 @@ public class ReversePreindex {
|
|||||||
leftIter.startOffset, leftIter.endOffset,
|
leftIter.startOffset, leftIter.endOffset,
|
||||||
rightIter.startOffset, rightIter.endOffset);
|
rightIter.startOffset, rightIter.endOffset);
|
||||||
|
|
||||||
mergeArrays2(documentsFile,
|
mergeArrays2(dest,
|
||||||
left.documents,
|
left.documents,
|
||||||
right.documents,
|
right.documents,
|
||||||
mergingIter.startOffset,
|
destIter.startOffset,
|
||||||
mergingIter.startOffset + 2*distinct,
|
destIter.startOffset + 2*distinct,
|
||||||
leftIter.startOffset, leftIter.endOffset,
|
leftIter.startOffset, leftIter.endOffset,
|
||||||
rightIter.startOffset, rightIter.endOffset);
|
rightIter.startOffset, rightIter.endOffset);
|
||||||
|
|
||||||
mergingIter.putNext(distinct);
|
destIter.putNext(distinct);
|
||||||
leftIter.next();
|
leftIter.next();
|
||||||
rightIter.next();
|
rightIter.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Copy the data from the source segment at the position and length indicated by sourceIter,
|
||||||
|
* into the destination segment, and advance the construction iterator.
|
||||||
|
*/
|
||||||
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
|
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
|
||||||
LongArray documentsFile,
|
LongArray dest,
|
||||||
FileChannel leftChannel,
|
FileChannel sourceChannel,
|
||||||
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
|
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
|
||||||
|
|
||||||
long size = sourceIter.endOffset - sourceIter.startOffset;
|
long size = sourceIter.endOffset - sourceIter.startOffset;
|
||||||
long start = mergingIter.startOffset;
|
long start = mergingIter.startOffset;
|
||||||
long end = start + size;
|
long end = start + size;
|
||||||
|
|
||||||
documentsFile.transferFrom(leftChannel,
|
dest.transferFrom(sourceChannel,
|
||||||
sourceIter.startOffset,
|
sourceIter.startOffset,
|
||||||
mergingIter.startOffset,
|
mergingIter.startOffset,
|
||||||
end);
|
end);
|
||||||
@ -248,12 +271,9 @@ public class ReversePreindex {
|
|||||||
boolean putNext = mergingIter.putNext(size / 2);
|
boolean putNext = mergingIter.putNext(size / 2);
|
||||||
boolean iterNext = sourceIter.next();
|
boolean iterNext = sourceIter.next();
|
||||||
|
|
||||||
if (!putNext) {
|
assert putNext || !iterNext : "Source iterator ran out before dest iterator?!";
|
||||||
assert !iterNext: "Source iterator ran out before dest iterator?!";
|
|
||||||
}
|
|
||||||
|
|
||||||
return iterNext;
|
return iterNext;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,7 +34,6 @@ public class ReversePreindexDocuments {
|
|||||||
Path docsFile,
|
Path docsFile,
|
||||||
IndexJournalReader reader,
|
IndexJournalReader reader,
|
||||||
DocIdRewriter docIdRewriter,
|
DocIdRewriter docIdRewriter,
|
||||||
SortingContext sortingContext,
|
|
||||||
ReversePreindexWordSegments segments) throws IOException {
|
ReversePreindexWordSegments segments) throws IOException {
|
||||||
|
|
||||||
|
|
||||||
@ -43,7 +42,7 @@ public class ReversePreindexDocuments {
|
|||||||
|
|
||||||
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
|
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
|
||||||
logger.info("Sorting data");
|
logger.info("Sorting data");
|
||||||
sortDocsFile(docsFileMap, segments, sortingContext);
|
sortDocsFile(docsFileMap, segments);
|
||||||
|
|
||||||
return new ReversePreindexDocuments(docsFileMap, docsFile);
|
return new ReversePreindexDocuments(docsFileMap, docsFile);
|
||||||
}
|
}
|
||||||
@ -90,7 +89,7 @@ public class ReversePreindexDocuments {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments, SortingContext sortingContext) throws IOException {
|
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
|
||||||
|
|
||||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||||
|
|
||||||
|
@ -51,7 +51,6 @@ public class ReversePreindexWordSegments {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
|
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
|
||||||
SortingContext ctx,
|
|
||||||
Path wordIdsFile,
|
Path wordIdsFile,
|
||||||
Path countsFile)
|
Path countsFile)
|
||||||
throws IOException
|
throws IOException
|
||||||
@ -73,7 +72,7 @@ public class ReversePreindexWordSegments {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Sort the words file
|
// Sort the words file
|
||||||
words.sortLargeSpan(ctx, 0, counts.size());
|
words.quickSort(0, counts.size());
|
||||||
|
|
||||||
// Populate the counts
|
// Populate the counts
|
||||||
for (i = 0; i < countsMap.size(); i++) {
|
for (i = 0; i < countsMap.size(); i++) {
|
||||||
|
@ -94,7 +94,7 @@ class ReverseIndexReaderTest {
|
|||||||
|
|
||||||
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
|
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
|
||||||
var reader = journalFactory.createReader(scenario);
|
var reader = journalFactory.createReader(scenario);
|
||||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
|
||||||
|
|
||||||
|
|
||||||
Path docsFile = tempDir.resolve("docs.dat");
|
Path docsFile = tempDir.resolve("docs.dat");
|
||||||
|
@ -20,7 +20,6 @@ class ReversePreindexDocsTest {
|
|||||||
Path wordsIdFile;
|
Path wordsIdFile;
|
||||||
Path docsFile;
|
Path docsFile;
|
||||||
Path tempDir;
|
Path tempDir;
|
||||||
SortingContext sortingContext;
|
|
||||||
|
|
||||||
TestJournalFactory journalFactory;
|
TestJournalFactory journalFactory;
|
||||||
|
|
||||||
@ -32,7 +31,6 @@ class ReversePreindexDocsTest {
|
|||||||
wordsIdFile = Files.createTempFile("words", ".dat");
|
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||||
docsFile = Files.createTempFile("docs", ".dat");
|
docsFile = Files.createTempFile("docs", ".dat");
|
||||||
tempDir = Files.createTempDirectory("sort");
|
tempDir = Files.createTempDirectory("sort");
|
||||||
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterEach
|
@AfterEach
|
||||||
@ -55,8 +53,8 @@ class ReversePreindexDocsTest {
|
|||||||
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
|
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
|
||||||
@ -84,8 +82,8 @@ class ReversePreindexDocsTest {
|
|||||||
new EntryData(-0xF00BA3L, 0, 4, 4)
|
new EntryData(-0xF00BA3L, 0, 4, 4)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
|
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
|
||||||
@ -110,8 +108,8 @@ class ReversePreindexDocsTest {
|
|||||||
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
|
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
|
||||||
|
@ -54,7 +54,7 @@ class ReversePreindexFinalizeTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testFinalizeSimple() throws IOException {
|
public void testFinalizeSimple() throws IOException {
|
||||||
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
|
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
|
||||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
|
||||||
|
|
||||||
|
|
||||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||||
@ -92,7 +92,7 @@ class ReversePreindexFinalizeTest {
|
|||||||
new EntryDataWithWordMeta(101, 101, wm(51, 52))
|
new EntryDataWithWordMeta(101, 101, wm(51, 52))
|
||||||
);
|
);
|
||||||
|
|
||||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
|
||||||
|
|
||||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||||
preindex.delete();
|
preindex.delete();
|
||||||
|
@ -54,8 +54,8 @@ class ReversePreindexMergeTest {
|
|||||||
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
|
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
|
||||||
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
|
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
|
||||||
|
|
||||||
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir, tempDir);
|
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir);
|
||||||
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir, tempDir);
|
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir);
|
||||||
return ReversePreindex.merge(tempDir, left, right);
|
return ReversePreindex.merge(tempDir, left, right);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,6 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
Path tempDir;
|
Path tempDir;
|
||||||
|
|
||||||
TestJournalFactory journalFactory;
|
TestJournalFactory journalFactory;
|
||||||
SortingContext sortingContext;
|
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws IOException {
|
public void setUp() throws IOException {
|
||||||
@ -32,7 +31,6 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
wordsIdFile = Files.createTempFile("words", ".dat");
|
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||||
docsFile = Files.createTempFile("docs", ".dat");
|
docsFile = Files.createTempFile("docs", ".dat");
|
||||||
tempDir = Files.createTempDirectory("sort");
|
tempDir = Files.createTempDirectory("sort");
|
||||||
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterEach
|
@AfterEach
|
||||||
@ -54,7 +52,7 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
new EntryData(-0xF00BA3L, 0, 1L<<33)
|
new EntryData(-0xF00BA3L, 0, 1L<<33)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var iter = segments.iterator(1);
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
@ -75,7 +73,7 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
new EntryData(-0xF00BA3L, 0, 5, 5)
|
new EntryData(-0xF00BA3L, 0, 5, 5)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var iter = segments.iterator(1);
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
@ -97,7 +95,7 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var iter = segments.iterator(1);
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
@ -123,7 +121,7 @@ class ReversePreindexWordSegmentsTest {
|
|||||||
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
||||||
);
|
);
|
||||||
|
|
||||||
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
|
||||||
var iter = segments.iterator(1);
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
List<TestSegmentData> expected = List.of(
|
List<TestSegmentData> expected = List.of(
|
||||||
|
Loading…
Reference in New Issue
Block a user