diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java index d294bc37..22fc1431 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java @@ -41,14 +41,14 @@ public class ReversePreindex { */ public static ReversePreindex constructPreindex(IndexJournalReader reader, DocIdRewriter docIdRewriter, - Path destDir) throws IOException + Path workDir) throws IOException { - Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat"); - Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat"); - Path docsFile = Files.createTempFile(destDir, "docs", ".dat"); + Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat"); + Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat"); + Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); - var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, segments); + var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments); return new ReversePreindex(segments, docs); } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java index c2ab0a93..0f232577 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java @@ -4,6 +4,7 @@ import lombok.SneakyThrows; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.rwf.RandomFileAssembler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -23,7 +24,7 @@ public class ReversePreindexDocuments { final Path file; public final LongArray documents; private static final int RECORD_SIZE_LONGS = 2; - private static final Logger logger= LoggerFactory.getLogger(ReversePreindexDocuments.class); + private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class); public ReversePreindexDocuments(LongArray documents, Path file) { this.documents = documents; @@ -32,11 +33,12 @@ public class ReversePreindexDocuments { public static ReversePreindexDocuments construct( Path docsFile, + Path workDir, IndexJournalReader reader, DocIdRewriter docIdRewriter, ReversePreindexWordSegments segments) throws IOException { - createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter); + createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); sortDocsFile(docsFileMap, segments); @@ -58,12 +60,14 @@ public class ReversePreindexDocuments { } private static void createUnsortedDocsFile(Path docsFile, + Path workDir, IndexJournalReader reader, ReversePreindexWordSegments segments, DocIdRewriter docIdRewriter) throws IOException { - long fileSize = RECORD_SIZE_LONGS * segments.totalSize(); - try (LongArray outArray = LongArrayFactory.onHeapConfined(fileSize)) { + long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); + + try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) { var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); @@ -77,12 +81,12 @@ public class ReversePreindexDocuments { long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); - outArray.set(offset + 0, rankEncodedId); - outArray.set(offset + 1, wordMeta); + assembly.put(offset + 0, rankEncodedId); + assembly.put(offset + 1, wordMeta); } } - outArray.write(docsFile); + assembly.write(docsFile); } } diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java index 6d3b7bf4..d6d81818 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexDocsTest.java @@ -1,6 +1,5 @@ package nu.marginalia.index.construction; -import nu.marginalia.array.algo.SortingContext; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -54,7 +53,7 @@ class ReversePreindexDocsTest { ); var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments); + var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); List expected = List.of( new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }), @@ -83,7 +82,7 @@ class ReversePreindexDocsTest { ); var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments); + var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); List expected = List.of( new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 }) @@ -109,7 +108,7 @@ class ReversePreindexDocsTest { ); var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), segments); + var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); List expected = List.of( new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }), diff --git a/code/libraries/random-write-funnel/build.gradle b/code/libraries/random-write-funnel/build.gradle index e8b11046..997ff739 100644 --- a/code/libraries/random-write-funnel/build.gradle +++ b/code/libraries/random-write-funnel/build.gradle @@ -9,6 +9,8 @@ java { } dependencies { + implementation project(':code:libraries:array') + implementation libs.bundles.slf4j testImplementation libs.bundles.slf4j.test diff --git a/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomFileAssembler.java b/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomFileAssembler.java new file mode 100644 index 00000000..b7425280 --- /dev/null +++ b/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomFileAssembler.java @@ -0,0 +1,125 @@ +package nu.marginalia.rwf; + +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; + +/** A RandomFileAssembler is a way to write a large file out of order + * in a way that is efficient for SSDs. + */ +public interface RandomFileAssembler extends AutoCloseable { + + void put(long address, long data) throws IOException; + void write(Path file) throws IOException; + void close() throws IOException; + + + /** Select the appropriate RandomFileAssembler implementation based on + * the system configuration. + */ + static RandomFileAssembler create(Path workDir, + long totalSize) throws IOException { + // If the system is configured to conserve memory, we use temp files + if (Boolean.getBoolean("system.conserve-memory")) { + return ofTempFiles(workDir); + } + + // If the file is small, we use straight mmap + if (totalSize < 128_000_000) { // 128M longs = 1 GB + return ofMmap(workDir, totalSize); + } + + // If the file is large, we use an in-memory buffer to avoid disk thrashing + return ofInMemoryAsssembly(totalSize); + + } + + + /** Create a RandomFileAssembler that writes to a series of small files. + * This has negligible memory overhead, but is slower than in-memory + * or mmap for small files. + */ + static RandomFileAssembler ofTempFiles(Path workDir) throws IOException { + + return new RandomFileAssembler() { + private final RandomWriteFunnel funnel = new RandomWriteFunnel(workDir, 10_000_000); + @Override + public void put(long address, long data) throws IOException { + funnel.put(address, data); + } + + @Override + public void write(Path file) throws IOException { + try (var channel = Files.newByteChannel(file, StandardOpenOption.WRITE, StandardOpenOption.CREATE)) { + funnel.write(channel); + } + } + + @Override + public void close() throws IOException { + funnel.close(); + } + }; + } + + /** Create a RandomFileAssembler that writes to a LongArray in memory. */ + static RandomFileAssembler ofInMemoryAsssembly(long size) { + return new RandomFileAssembler() { + private final LongArray buffer = LongArrayFactory.onHeapConfined(size); + + @Override + public void put(long address, long data) { + buffer.set(address, data); + } + + @Override + public void write(Path file) throws IOException { + buffer.write(file); + } + + @Override + public void close() { + buffer.close(); + } + }; + } + + /** Create a RandomFileAssembler that writes to a file using mmap. + * This is the fastest method for small files, but has a large memory + * overhead and is slow for large files, where the OS will start pushing + * changes to disk continuously. + * */ + static RandomFileAssembler ofMmap(Path destDir, long size) throws IOException { + return new RandomFileAssembler() { + private final Path workFile = Files.createTempFile(destDir, "mmap", ".dat"); + private final LongArray buffer = LongArrayFactory.mmapForWritingConfined(workFile, size); + + @Override + public void put(long address, long data) { + buffer.set(address, data); + } + + @Override + public void write(Path dest) throws IOException { + buffer.force(); + + Files.move(workFile, dest, + StandardCopyOption.REPLACE_EXISTING, + StandardCopyOption.ATOMIC_MOVE); + } + + @Override + public void close() throws IOException { + buffer.close(); + + // Catch the case where e.g. write() fails with an exception and workFile doesn't get moved + Files.deleteIfExists(workFile); + } + }; + } +} diff --git a/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java b/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java index a999597b..43f09a53 100644 --- a/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java +++ b/code/libraries/random-write-funnel/src/main/java/nu/marginalia/rwf/RandomWriteFunnel.java @@ -26,7 +26,7 @@ public class RandomWriteFunnel implements AutoCloseable { private final Path tempDir; private final int binSize; - public RandomWriteFunnel(Path tempDir, int binSize) throws IOException { + RandomWriteFunnel(Path tempDir, int binSize) throws IOException { this.binSize = binSize; this.tempDir = tempDir;