(index-reverse) Add documentation and clean up code.

This commit is contained in:
Viktor Lofgren 2023-08-29 11:35:54 +02:00
parent ba4513e82c
commit a2e6616100
13 changed files with 184 additions and 132 deletions

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 21 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 29 KiB

View File

@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i
[1] See WordFlags in [common/model](../../common/model/) and
KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
## Construction
The reverse index is constructed by first building a series of preindexes.
Preindexes consist of a Segment and a Documents object. The segment contains
information about which word identifiers are present and how many, and the
documents contain information about in which documents the words can be found.
![Memory layout illustrations](./preindex.svg)
These would typically not fit in RAM, so the index journal is paged
and the preindexes are constructed small enough to fit in memory, and
then merged. Merging sorted arrays is a very fast operation that does
not require additional RAM.
![Illustration of successively merged preindex files](./merging.svg)
Once merged into one large preindex, indexes are added to the preindex data
to form a finalized reverse index.
![Illustration of the data layout of the finalized index](./index.svg)
## Central Classes
* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index.
* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index.
* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index.
* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index.
* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
## See Also
* [index-journal](../index-journal)
* [index-forward](../index-forward)
* [libraries/btree](../../libraries/btree)
* [libraries/array](../../libraries/array)

View File

@ -32,7 +32,7 @@ public class ReverseIndexConstructor {
for (var input : inputs) {
logger.info("Construcing preindex from {}", input);
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir);
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir);
preindexes.add(preindex);
}

View File

@ -16,9 +16,18 @@ import java.nio.file.StandardOpenOption;
import static nu.marginalia.array.algo.TwoArrayOperations.*;
/** Contains the data that would go into a reverse index,
* that is, a mapping from words to documents, minus the actual
* index structure that makes the data quick to access while
* searching.
* <p>
* Two preindexes can be merged into a third preindex containing
* the union of their data. This operation requires no additional
* RAM.
*/
public class ReversePreindex {
public final ReversePreindexWordSegments segments;
public final ReversePreindexDocuments documents;
final ReversePreindexWordSegments segments;
final ReversePreindexDocuments documents;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
@ -27,6 +36,26 @@ public class ReversePreindex {
this.documents = documents;
}
/** Constructs a new preindex with the data associated with reader. The backing files
* will have randomly assigned names.
*/
public static ReversePreindex constructPreindex(IndexJournalReader reader,
DocIdRewriter docIdRewriter,
Path destDir) throws IOException
{
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
logger.info("Segmenting");
var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
logger.info("Mapping docs");
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments);
logger.info("Done");
return new ReversePreindex(segments, docs);
}
/** Transform the preindex into a reverse index */
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
var offsets = segments.counts;
@ -72,22 +101,79 @@ public class ReversePreindex {
segments.delete();
documents.delete();
}
public static ReversePreindex constructPreindex(IndexJournalReader reader,
DocIdRewriter docIdRewriter,
Path tempDir,
Path destDir) throws IOException
{
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
public static ReversePreindex merge(Path destDir,
ReversePreindex left,
ReversePreindex right) throws IOException {
ReversePreindexWordSegments mergingSegment =
createMergedSegmentWordFile(destDir, left.segments, right.segments);
var mergingIter = mergingSegment.constructionIterator(2);
var leftIter = left.segments.iterator(2);
var rightIter = right.segments.iterator(2);
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
SortingContext ctx = new SortingContext(tempDir, 1<<31);
logger.info("Segmenting");
var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
logger.info("Mapping docs");
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments);
logger.info("Done");
return new ReversePreindex(segments, docs);
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
leftIter.next();
rightIter.next();
try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel())
{
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
final long currentWord = mergingIter.wordId;
if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
{
// both inputs have documents for the current word
mergeSegments(leftIter, rightIter,
left.documents, right.documents,
mergedDocuments, mergingIter);
}
else if (leftIter.wordId == currentWord) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == currentWord) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else assert false : "This should never happen"; // the helvetica scenario
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
}
}
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
// We may have overestimated the size of the merged docs size in the case there were
// duplicates in the data, so we need to shrink it to the actual size we wrote.
mergedDocuments = shrinkMergedDocuments(mergedDocuments,
docsFile, 2 * mergingSegment.totalSize());
mergingSegment.force();
return new ReversePreindex(
mergingSegment,
new ReversePreindexDocuments(mergedDocuments, docsFile)
);
}
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
@ -114,79 +200,10 @@ public class ReversePreindex {
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
}
public static ReversePreindex merge(Path destDir,
ReversePreindex left,
ReversePreindex right) throws IOException {
ReversePreindexWordSegments mergingSegment = createMergedSegmentWordFile(destDir,
left.segments,
right.segments);
var mergingIter = mergingSegment.constructionIterator(2);
var leftIter = left.segments.iterator(2);
var rightIter = right.segments.iterator(2);
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
leftIter.next();
rightIter.next();
FileChannel leftChannel = left.documents.createDocumentsFileChannel();
FileChannel rightChannel = right.documents.createDocumentsFileChannel();
while (mergingIter.canPutMore()
&& leftIter.isPositionBeforeEnd()
&& rightIter.isPositionBeforeEnd())
{
if (leftIter.wordId == mergingIter.wordId
&& rightIter.wordId == mergingIter.wordId) {
mergeSegments(leftIter,
rightIter,
left.documents,
right.documents,
mergedDocuments,
mergingIter);
}
else if (leftIter.wordId == mergingIter.wordId) {
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
break;
}
else if (rightIter.wordId == mergingIter.wordId) {
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
break;
}
else {
assert false : "This should never happen";
}
}
if (leftIter.isPositionBeforeEnd()) {
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
}
if (rightIter.isPositionBeforeEnd()) {
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
}
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
// We may have overestimated the size of the merged docs size in the case there were
// duplicates in the data, so we need to shrink it to the actual size we wrote.
mergedDocuments = shrinkMergedDocuments(mergedDocuments, docsFile, 2 * mergingSegment.totalSize());
mergingSegment.force();
return new ReversePreindex(
mergingSegment,
new ReversePreindexDocuments(mergedDocuments, docsFile)
);
}
/** It's possible we overestimated the necessary size of the documents file,
* this will permit us to shrink it down to the smallest necessary size.
*/
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
mergedDocuments.force();
@ -205,12 +222,15 @@ public class ReversePreindex {
return mergedDocuments;
}
/** Merge contents of the segments indicated by leftIter and rightIter into the destination
* segment, and advance the construction iterator with the appropriate size.
*/
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
ReversePreindexWordSegments.SegmentIterator rightIter,
ReversePreindexDocuments left,
ReversePreindexDocuments right,
LongArray documentsFile,
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter)
LongArray dest,
ReversePreindexWordSegments.SegmentConstructionIterator destIter)
{
long distinct = countDistinctElementsN(2,
left.documents,
@ -218,29 +238,32 @@ public class ReversePreindex {
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
mergeArrays2(documentsFile,
mergeArrays2(dest,
left.documents,
right.documents,
mergingIter.startOffset,
mergingIter.startOffset + 2*distinct,
destIter.startOffset,
destIter.startOffset + 2*distinct,
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
mergingIter.putNext(distinct);
destIter.putNext(distinct);
leftIter.next();
rightIter.next();
}
/** Copy the data from the source segment at the position and length indicated by sourceIter,
* into the destination segment, and advance the construction iterator.
*/
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
LongArray documentsFile,
FileChannel leftChannel,
LongArray dest,
FileChannel sourceChannel,
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
documentsFile.transferFrom(leftChannel,
dest.transferFrom(sourceChannel,
sourceIter.startOffset,
mergingIter.startOffset,
end);
@ -248,12 +271,9 @@ public class ReversePreindex {
boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next();
if (!putNext) {
assert !iterNext: "Source iterator ran out before dest iterator?!";
}
assert putNext || !iterNext : "Source iterator ran out before dest iterator?!";
return iterNext;
}

View File

@ -34,7 +34,6 @@ public class ReversePreindexDocuments {
Path docsFile,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
SortingContext sortingContext,
ReversePreindexWordSegments segments) throws IOException {
@ -43,7 +42,7 @@ public class ReversePreindexDocuments {
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
logger.info("Sorting data");
sortDocsFile(docsFileMap, segments, sortingContext);
sortDocsFile(docsFileMap, segments);
return new ReversePreindexDocuments(docsFileMap, docsFile);
}
@ -90,7 +89,7 @@ public class ReversePreindexDocuments {
}
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments, SortingContext sortingContext) throws IOException {
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
var iter = segments.iterator(RECORD_SIZE_LONGS);

View File

@ -51,7 +51,6 @@ public class ReversePreindexWordSegments {
}
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
SortingContext ctx,
Path wordIdsFile,
Path countsFile)
throws IOException
@ -73,7 +72,7 @@ public class ReversePreindexWordSegments {
}
// Sort the words file
words.sortLargeSpan(ctx, 0, counts.size());
words.quickSort(0, counts.size());
// Populate the counts
for (i = 0; i < countsMap.size(); i++) {

View File

@ -94,7 +94,7 @@ class ReverseIndexReaderTest {
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
Path docsFile = tempDir.resolve("docs.dat");

View File

@ -20,7 +20,6 @@ class ReversePreindexDocsTest {
Path wordsIdFile;
Path docsFile;
Path tempDir;
SortingContext sortingContext;
TestJournalFactory journalFactory;
@ -32,7 +31,6 @@ class ReversePreindexDocsTest {
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
}
@AfterEach
@ -55,8 +53,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
@ -84,8 +82,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 4, 4)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
@ -110,8 +108,8 @@ class ReversePreindexDocsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List<TestSegmentData> expected = List.of(
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),

View File

@ -54,7 +54,7 @@ class ReversePreindexFinalizeTest {
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
@ -92,7 +92,7 @@ class ReversePreindexFinalizeTest {
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();

View File

@ -54,8 +54,8 @@ class ReversePreindexMergeTest {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir, tempDir);
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir, tempDir);
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir);
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir);
return ReversePreindex.merge(tempDir, left, right);
}

View File

@ -22,7 +22,6 @@ class ReversePreindexWordSegmentsTest {
Path tempDir;
TestJournalFactory journalFactory;
SortingContext sortingContext;
@BeforeEach
public void setUp() throws IOException {
@ -32,7 +31,6 @@ class ReversePreindexWordSegmentsTest {
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
}
@AfterEach
@ -54,7 +52,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 1L<<33)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@ -75,7 +73,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 5, 5)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@ -97,7 +95,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(
@ -123,7 +121,7 @@ class ReversePreindexWordSegmentsTest {
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
);
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
var iter = segments.iterator(1);
List<TestSegmentData> expected = List.of(