diff --git a/code/features-index/index-reverse/index.svg b/code/features-index/index-reverse/index.svg new file mode 100644 index 00000000..8c0184ea --- /dev/null +++ b/code/features-index/index-reverse/index.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/code/features-index/index-reverse/merging.svg b/code/features-index/index-reverse/merging.svg new file mode 100644 index 00000000..ed023d52 --- /dev/null +++ b/code/features-index/index-reverse/merging.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/code/features-index/index-reverse/preindex.svg b/code/features-index/index-reverse/preindex.svg new file mode 100644 index 00000000..456f56a4 --- /dev/null +++ b/code/features-index/index-reverse/preindex.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/code/features-index/index-reverse/readme.md b/code/features-index/index-reverse/readme.md index 5a9db1e5..a27371d6 100644 --- a/code/features-index/index-reverse/readme.md +++ b/code/features-index/index-reverse/readme.md @@ -12,9 +12,35 @@ The full index also provides access to term-level metadata, while the priority i [1] See WordFlags in [common/model](../../common/model/) and KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction). +## Construction + +The reverse index is constructed by first building a series of preindexes. +Preindexes consist of a Segment and a Documents object. The segment contains +information about which word identifiers are present and how many, and the +documents contain information about in which documents the words can be found. + + + +These would typically not fit in RAM, so the index journal is paged +and the preindexes are constructed small enough to fit in memory, and +then merged. Merging sorted arrays is a very fast operation that does +not require additional RAM. 
+ + + +Once merged into one large preindex, indexes are added to the preindex data +to form a finalized reverse index. + + ## Central Classes -* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index. -* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index. -* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index. -* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index. +* [ReversePreindex](src/main/java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state. +* [ReverseIndexConstructor](src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index. +* [ReverseIndexReader](src/main/java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index. 
+ +## See Also + +* [index-journal](../index-journal) +* [index-forward](../index-forward) +* [libraries/btree](../../libraries/btree) +* [libraries/array](../../libraries/array) \ No newline at end of file diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java index 91e6e60b..c7e42e98 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java @@ -32,7 +32,7 @@ public class ReverseIndexConstructor { for (var input : inputs) { logger.info("Construcing preindex from {}", input); - var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir); + var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir); preindexes.add(preindex); } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java index 19d3ad99..284f7df7 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java @@ -16,9 +16,18 @@ import java.nio.file.StandardOpenOption; import static nu.marginalia.array.algo.TwoArrayOperations.*; +/** Contains the data that would go into a reverse index, + * that is, a mapping from words to documents, minus the actual + * index structure that makes the data quick to access while + * searching. + *
+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
public class ReversePreindex {
- public final ReversePreindexWordSegments segments;
- public final ReversePreindexDocuments documents;
+ final ReversePreindexWordSegments segments;
+ final ReversePreindexDocuments documents;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
@@ -27,6 +36,26 @@ public class ReversePreindex {
this.documents = documents;
}
+ /** Constructs a new preindex from the data supplied by reader. The backing files
+ * are created in destDir as temp files with randomly assigned names.
+ */
+ public static ReversePreindex constructPreindex(IndexJournalReader reader,
+ DocIdRewriter docIdRewriter,
+ Path destDir) throws IOException
+ {
+ Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+ Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+ Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+ logger.info("Segmenting");
+ var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
+ logger.info("Mapping docs");
+ var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments);
+ logger.info("Done");
+ return new ReversePreindex(segments, docs);
+ }
+
+ /** Transform the preindex into a reverse index */
public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
var offsets = segments.counts;
@@ -72,30 +101,87 @@ public class ReversePreindex {
segments.delete();
documents.delete();
}
- public static ReversePreindex constructPreindex(IndexJournalReader reader,
- DocIdRewriter docIdRewriter,
- Path tempDir,
- Path destDir) throws IOException
- {
- Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
- Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+ public static ReversePreindex merge(Path destDir,
+ ReversePreindex left,
+ ReversePreindex right) throws IOException {
+
+ ReversePreindexWordSegments mergingSegment =
+ createMergedSegmentWordFile(destDir, left.segments, right.segments);
+
+ var mergingIter = mergingSegment.constructionIterator(2);
+ var leftIter = left.segments.iterator(2);
+ var rightIter = right.segments.iterator(2);
+
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
- SortingContext ctx = new SortingContext(tempDir, 1<<31);
- logger.info("Segmenting");
- var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
- logger.info("Mapping docs");
- var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments);
- logger.info("Done");
- return new ReversePreindex(segments, docs);
+ LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
+
+ leftIter.next();
+ rightIter.next();
+
+ try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
+ FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+ {
+
+ while (mergingIter.canPutMore()
+ && leftIter.isPositionBeforeEnd()
+ && rightIter.isPositionBeforeEnd())
+ {
+ final long currentWord = mergingIter.wordId;
+
+ if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
+ {
+ // both inputs have documents for the current word
+ mergeSegments(leftIter, rightIter,
+ left.documents, right.documents,
+ mergedDocuments, mergingIter);
+ }
+ else if (leftIter.wordId == currentWord) {
+ if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
+ break;
+ }
+ else if (rightIter.wordId == currentWord) {
+ if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
+ break;
+ }
+ else assert false : "This should never happen"; // the helvetica scenario
+ }
+
+ if (leftIter.isPositionBeforeEnd()) {
+ while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+ }
+
+ if (rightIter.isPositionBeforeEnd()) {
+ while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+ }
+
+ }
+
+ assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
+ assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
+ assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
+
+ // We may have overestimated the size of the merged docs file in the case there were
+ // duplicates in the data, so we need to shrink it to the actual size we wrote.
+
+ mergedDocuments = shrinkMergedDocuments(mergedDocuments,
+ docsFile, 2 * mergingSegment.totalSize());
+
+ mergingSegment.force();
+
+ return new ReversePreindex(
+ mergingSegment,
+ new ReversePreindexDocuments(mergedDocuments, docsFile)
+ );
}
/** Create a segment word file with each word from both inputs, with zero counts for all the data.
* This is an intermediate product in merging.
*/
static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
- ReversePreindexWordSegments left,
- ReversePreindexWordSegments right) throws IOException {
+ ReversePreindexWordSegments left,
+ ReversePreindexWordSegments right) throws IOException {
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
@@ -114,79 +200,10 @@ public class ReversePreindex {
return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
}
- public static ReversePreindex merge(Path destDir,
- ReversePreindex left,
- ReversePreindex right) throws IOException {
-
- ReversePreindexWordSegments mergingSegment = createMergedSegmentWordFile(destDir,
- left.segments,
- right.segments);
-
- var mergingIter = mergingSegment.constructionIterator(2);
- var leftIter = left.segments.iterator(2);
- var rightIter = right.segments.iterator(2);
-
- Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
-
- LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
-
- leftIter.next();
- rightIter.next();
-
- FileChannel leftChannel = left.documents.createDocumentsFileChannel();
- FileChannel rightChannel = right.documents.createDocumentsFileChannel();
-
- while (mergingIter.canPutMore()
- && leftIter.isPositionBeforeEnd()
- && rightIter.isPositionBeforeEnd())
- {
- if (leftIter.wordId == mergingIter.wordId
- && rightIter.wordId == mergingIter.wordId) {
- mergeSegments(leftIter,
- rightIter,
- left.documents,
- right.documents,
- mergedDocuments,
- mergingIter);
- }
- else if (leftIter.wordId == mergingIter.wordId) {
- if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
- break;
- }
- else if (rightIter.wordId == mergingIter.wordId) {
- if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
- break;
- }
- else {
- assert false : "This should never happen";
- }
- }
-
- if (leftIter.isPositionBeforeEnd()) {
- while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
-
- }
- if (rightIter.isPositionBeforeEnd()) {
- while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
- }
-
- assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
- assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
- assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
-
- // We may have overestimated the size of the merged docs size in the case there were
- // duplicates in the data, so we need to shrink it to the actual size we wrote.
-
- mergedDocuments = shrinkMergedDocuments(mergedDocuments, docsFile, 2 * mergingSegment.totalSize());
-
- mergingSegment.force();
-
- return new ReversePreindex(
- mergingSegment,
- new ReversePreindexDocuments(mergedDocuments, docsFile)
- );
- }
+ /** It's possible we overestimated the necessary size of the documents file;
+ * this will permit us to shrink it down to the smallest necessary size.
+ */
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
mergedDocuments.force();
@@ -205,12 +222,15 @@ public class ReversePreindex {
return mergedDocuments;
}
+ /** Merge contents of the segments indicated by leftIter and rightIter into the destination
+ * segment, and advance the construction iterator with the appropriate size.
+ */
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
ReversePreindexWordSegments.SegmentIterator rightIter,
ReversePreindexDocuments left,
ReversePreindexDocuments right,
- LongArray documentsFile,
- ReversePreindexWordSegments.SegmentConstructionIterator mergingIter)
+ LongArray dest,
+ ReversePreindexWordSegments.SegmentConstructionIterator destIter)
{
long distinct = countDistinctElementsN(2,
left.documents,
@@ -218,29 +238,32 @@ public class ReversePreindex {
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
- mergeArrays2(documentsFile,
+ mergeArrays2(dest,
left.documents,
right.documents,
- mergingIter.startOffset,
- mergingIter.startOffset + 2*distinct,
+ destIter.startOffset,
+ destIter.startOffset + 2*distinct,
leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset);
- mergingIter.putNext(distinct);
+ destIter.putNext(distinct);
leftIter.next();
rightIter.next();
}
+ /** Copy the data from the source segment at the position and length indicated by sourceIter,
+ * into the destination segment, and advance the construction iterator.
+ */
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
- LongArray documentsFile,
- FileChannel leftChannel,
+ LongArray dest,
+ FileChannel sourceChannel,
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
long size = sourceIter.endOffset - sourceIter.startOffset;
long start = mergingIter.startOffset;
long end = start + size;
- documentsFile.transferFrom(leftChannel,
+ dest.transferFrom(sourceChannel,
sourceIter.startOffset,
mergingIter.startOffset,
end);
@@ -248,12 +271,9 @@ public class ReversePreindex {
boolean putNext = mergingIter.putNext(size / 2);
boolean iterNext = sourceIter.next();
- if (!putNext) {
- assert !iterNext: "Source iterator ran out before dest iterator?!";
- }
+ assert putNext || !iterNext : "Source iterator ran out before dest iterator?!";
return iterNext;
-
}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java
index 4f5d0c61..c51a977d 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java
@@ -34,7 +34,6 @@ public class ReversePreindexDocuments {
Path docsFile,
IndexJournalReader reader,
DocIdRewriter docIdRewriter,
- SortingContext sortingContext,
ReversePreindexWordSegments segments) throws IOException {
@@ -43,7 +42,7 @@ public class ReversePreindexDocuments {
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
logger.info("Sorting data");
- sortDocsFile(docsFileMap, segments, sortingContext);
+ sortDocsFile(docsFileMap, segments);
return new ReversePreindexDocuments(docsFileMap, docsFile);
}
@@ -90,7 +89,7 @@ public class ReversePreindexDocuments {
}
@SneakyThrows
- private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments, SortingContext sortingContext) throws IOException {
+ private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException {
var iter = segments.iterator(RECORD_SIZE_LONGS);
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
index 5a0e8f2d..5acd2219 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java
@@ -51,7 +51,6 @@ public class ReversePreindexWordSegments {
}
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
- SortingContext ctx,
Path wordIdsFile,
Path countsFile)
throws IOException
@@ -73,7 +72,7 @@ public class ReversePreindexWordSegments {
}
// Sort the words file
- words.sortLargeSpan(ctx, 0, counts.size());
+ words.quickSort(0, counts.size());
// Populate the counts
for (i = 0; i < countsMap.size(); i++) {
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java
index 3963fd2d..e05fdf78 100644
--- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java
@@ -94,7 +94,7 @@ class ReverseIndexReaderTest {
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
- var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
+ var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir);
Path docsFile = tempDir.resolve("docs.dat");
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java
index 517c1ae6..6d3b7bf4 100644
--- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java
@@ -20,7 +20,6 @@ class ReversePreindexDocsTest {
Path wordsIdFile;
Path docsFile;
Path tempDir;
- SortingContext sortingContext;
TestJournalFactory journalFactory;
@@ -32,7 +31,6 @@ class ReversePreindexDocsTest {
wordsIdFile = Files.createTempFile("words", ".dat");
docsFile = Files.createTempFile("docs", ".dat");
tempDir = Files.createTempDirectory("sort");
- sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
}
@AfterEach
@@ -55,8 +53,8 @@ class ReversePreindexDocsTest {
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
);
- var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
- var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
+ var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile);
+ var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments);
List