From a8bec13ed99aeded8027c43d18bee7d11e2e7851 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren <vlofgren@marginalia.nu>
Date: Fri, 13 Sep 2024 16:14:56 +0200
Subject: [PATCH] (index) Evaluate using mmap reads during index construction
 in favor of filechannel reads

It's likely that this will be faster, as the reads are on average small and sequential, and can't be buffered easily.
---
 .../full/FullIndexBTreeTransformer.java       | 10 ++-
 .../index/construction/full/FullPreindex.java | 72 ++++++++-----------
 .../index/construction/prio/PrioPreindex.java | 68 +++++++++---------
 .../marginalia/array/algo/LongArrayBase.java  |  1 +
 .../array/page/SegmentLongArray.java          | 19 +++++
 .../array/page/UnsafeLongArray.java           | 19 +++++
 6 files changed, 107 insertions(+), 82 deletions(-)

diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
index ccf21331..0af6165e 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java
@@ -6,14 +6,12 @@ import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.btree.model.BTreeContext;
 
 import java.io.IOException;
-import java.nio.channels.FileChannel;
 
 /** Constructs the BTrees in a reverse index */
 public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer {
     private final BTreeWriter writer;
-    private final FileChannel intermediateChannel;
-
     private final int entrySize;
+    private final LongArray documentsArray;
 
     long start = 0;
     long writeOffset = 0;
@@ -21,10 +19,10 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
     public FullIndexBTreeTransformer(LongArray urlsFileMap,
                                      int entrySize,
                                      BTreeContext bTreeContext,
-                                     FileChannel intermediateChannel) {
+                                     LongArray documentsArray) {
+        this.documentsArray = documentsArray;
         this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
         this.entrySize = entrySize;
-        this.intermediateChannel = intermediateChannel;
     }
 
     @Override
@@ -39,7 +37,7 @@ public class FullIndexBTreeTransformer implements LongArrayTransformations.LongI
         final long offsetForBlock = writeOffset;
 
         writeOffset += writer.write(writeOffset, size,
-                mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+                mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start)
         );
 
         start = end;
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
index 50f3a4bb..4774519e 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java
@@ -13,7 +13,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
-import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
@@ -87,13 +86,10 @@ public class FullPreindex {
 
         // Write the docs file
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
-        try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
-            offsets.transformEachIO(0, offsets.size(),
-                    new FullIndexBTreeTransformer(finalDocs, 2,
-                            ReverseIndexParameters.fullDocsBTreeContext,
-                            intermediateDocChannel));
-            intermediateDocChannel.force(false);
-        }
+        offsets.transformEachIO(0, offsets.size(),
+                new FullIndexBTreeTransformer(finalDocs, 2,
+                        ReverseIndexParameters.fullDocsBTreeContext,
+                        documents.documents));
 
         LongArray wordIds = segments.wordIds;
 
@@ -148,42 +144,36 @@ public class FullPreindex {
         leftIter.next();
         rightIter.next();
 
-        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
-             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+        while (mergingIter.canPutMore()
+                && leftIter.isPositionBeforeEnd()
+                && rightIter.isPositionBeforeEnd())
         {
+            final long currentWord = mergingIter.wordId;
 
-            while (mergingIter.canPutMore()
-                    && leftIter.isPositionBeforeEnd()
-                    && rightIter.isPositionBeforeEnd())
+            if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
             {
-                final long currentWord = mergingIter.wordId;
-
-                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
-                {
-                    // both inputs have documents for the current word
-                    mergeSegments(leftIter, rightIter,
-                            left.documents, right.documents,
-                            mergedDocuments, mergingIter);
-                }
-                else if (leftIter.wordId == currentWord) {
-                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
-                        break;
-                }
-                else if (rightIter.wordId == currentWord) {
-                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
-                        break;
-                }
-                else assert false : "This should never happen"; // the helvetica scenario
+                // both inputs have documents for the current word
+                mergeSegments(leftIter, rightIter,
+                        left.documents, right.documents,
+                        mergedDocuments, mergingIter);
             }
-
-            if (leftIter.isPositionBeforeEnd()) {
-                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+            else if (leftIter.wordId == currentWord) {
+                if (!copySegment(leftIter, left.documents,  mergingIter, mergedDocuments))
+                    break;
             }
-
-            if (rightIter.isPositionBeforeEnd()) {
-                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+            else if (rightIter.wordId == currentWord) {
+                if (!copySegment(rightIter, right.documents,  mergingIter, mergedDocuments))
+                    break;
             }
+            else assert false : "This should never happen"; // the helvetica scenario
+        }
 
+        if (leftIter.isPositionBeforeEnd()) {
+            while (copySegment(leftIter, left.documents,  mergingIter, mergedDocuments));
+        }
+
+        if (rightIter.isPositionBeforeEnd()) {
+            while (copySegment(rightIter, right.documents,  mergingIter, mergedDocuments));
         }
 
         if (leftIter.isPositionBeforeEnd())
@@ -284,15 +274,15 @@ public class FullPreindex {
      * into the destination segment, and advance the construction iterator.
      */
     private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+                                       FullPreindexDocuments srcDocuments,
+                                       FullPreindexWordSegments.SegmentConstructionIterator mergingIter,
+                                       LongArray dest) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
         long end = start + size;
 
-        dest.transferFrom(sourceChannel,
+        dest.transferFrom(srcDocuments.documents,
                 sourceIter.startOffset,
                 mergingIter.startOffset,
                 end);
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
index ee1ab3ac..e0a8db92 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java
@@ -139,44 +139,39 @@ public class PrioPreindex {
         leftIter.next();
         rightIter.next();
 
-        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
-             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+        while (mergingIter.canPutMore()
+                && leftIter.isPositionBeforeEnd()
+                && rightIter.isPositionBeforeEnd())
         {
+            final long currentWord = mergingIter.wordId;
 
-            while (mergingIter.canPutMore()
-                    && leftIter.isPositionBeforeEnd()
-                    && rightIter.isPositionBeforeEnd())
+            if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
             {
-                final long currentWord = mergingIter.wordId;
-
-                if (leftIter.wordId == currentWord && rightIter.wordId == currentWord)
-                {
-                    // both inputs have documents for the current word
-                    mergeSegments(leftIter, rightIter,
-                            left.documents, right.documents,
-                            mergedDocuments, mergingIter);
-                }
-                else if (leftIter.wordId == currentWord) {
-                    if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
-                        break;
-                }
-                else if (rightIter.wordId == currentWord) {
-                    if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
-                        break;
-                }
-                else assert false : "This should never happen"; // the helvetica scenario
+                // both inputs have documents for the current word
+                mergeSegments(leftIter, rightIter,
+                        left.documents, right.documents,
+                        mergedDocuments, mergingIter);
             }
-
-            if (leftIter.isPositionBeforeEnd()) {
-                while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
+            else if (leftIter.wordId == currentWord) {
+                if (!copySegment(leftIter, left.documents,  mergingIter, mergedDocuments))
+                    break;
             }
-
-            if (rightIter.isPositionBeforeEnd()) {
-                while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
+            else if (rightIter.wordId == currentWord) {
+                if (!copySegment(rightIter, right.documents,  mergingIter, mergedDocuments))
+                    break;
             }
-
+            else assert false : "This should never happen"; // the helvetica scenario
         }
 
+        if (leftIter.isPositionBeforeEnd()) {
+            while (copySegment(leftIter, left.documents,  mergingIter, mergedDocuments));
+        }
+
+        if (rightIter.isPositionBeforeEnd()) {
+            while (copySegment(rightIter, right.documents,  mergingIter, mergedDocuments));
+        }
+
+
         if (leftIter.isPositionBeforeEnd())
             throw new IllegalStateException("Left has more to go");
         if (rightIter.isPositionBeforeEnd())
@@ -270,24 +265,27 @@ public class PrioPreindex {
         rightIter.next();
     }
 
+    /** Copy the data from the source segment at the position and length indicated by sourceIter,
+     * into the destination segment, and advance the construction iterator.
+     */
     /** Copy the data from the source segment at the position and length indicated by sourceIter,
      * into the destination segment, and advance the construction iterator.
      */
     private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
-                                       LongArray dest,
-                                       FileChannel sourceChannel,
-                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
+                                       PrioPreindexDocuments srcDocuments,
+                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
+                                       LongArray dest) throws IOException {
 
         long size = sourceIter.endOffset - sourceIter.startOffset;
         long start = mergingIter.startOffset;
         long end = start + size;
 
-        dest.transferFrom(sourceChannel,
+        dest.transferFrom(srcDocuments.documents,
                 sourceIter.startOffset,
                 mergingIter.startOffset,
                 end);
 
-        boolean putNext = mergingIter.putNext(size);
+        boolean putNext = mergingIter.putNext(size / 2);
         boolean iterNext = sourceIter.next();
 
         if (!putNext && iterNext)
diff --git a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
index b5ef03da..5ce59973 100644
--- a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
+++ b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
@@ -108,4 +108,5 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
     void write(Path file) throws IOException;
 
     void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
+    void transferFrom(LongArray source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
 }
diff --git a/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java b/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
index ac420de9..5c63e5c3 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
@@ -167,6 +167,25 @@ public class SegmentLongArray implements LongArray {
         }
 
     }
+    
+    /** Copies elements from {@code source}, starting at element index sourceStartL, into
+     * this array's element range [destStartL, destEndL) using per-element get/set.
+     * @throws IndexOutOfBoundsException if the range is inverted or exceeds either array
+     */
+    @Override
+    public void transferFrom(LongArray source, long sourceStartL, long destStartL, long destEndL) {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Destination start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        for (long i = destStartL; i < destEndL; i++) {
+            set(i, source.get(sourceStartL + i - destStartL));
+        }
+    }
 
     @Override
     public MemorySegment getMemorySegment() {
diff --git a/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java b/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
index 04ea42d4..509fb829 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
@@ -269,4 +269,23 @@ public class UnsafeLongArray implements LongArray {
         }
     }
 
+    /** Copies elements from {@code source}, starting at element index sourceStartL, into
+     * this array's element range [destStartL, destEndL) using per-element get/set.
+     * @throws IndexOutOfBoundsException if the range is inverted or exceeds either array
+     */
+    @Override
+    public void transferFrom(LongArray source, long sourceStartL, long destStartL, long destEndL) {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Destination start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        for (long i = destStartL; i < destEndL; i++) {
+            set(i, source.get(sourceStartL + i - destStartL));
+        }
+    }
+
 }