From 4d3ef0e3b3f0790cd054e80628a5579db84a35bc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 11 Jan 2023 16:11:29 +0100 Subject: [PATCH] Tool for cleaning raw index files based on a predicate. --- .../util/array/algo/BulkTransferArray.java | 2 + .../util/array/algo/LongArrayBase.java | 1 + .../util/array/page/AbstractPagingArray.java | 19 +++++ .../reader/SearchIndexJournalCleaner.java | 71 +++++++++++++++++++ .../reader/SearchIndexJournalReadEntry.java | 21 ++++-- .../reader/SearchIndexJournalReader.java | 3 + .../SearchIndexJournalReaderSingleFile.java | 4 ++ .../StripSimpleJournalEntriesToolMain.java | 28 ++++++++ 8 files changed, 142 insertions(+), 7 deletions(-) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/StripSimpleJournalEntriesToolMain.java diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java index d01d3716..bf0df57d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/BulkTransferArray.java @@ -3,4 +3,6 @@ package nu.marginalia.util.array.algo; public interface BulkTransferArray { void set(long start, long end, BufferType buffer, int bufferStart); + + void get(long start, long end, BufferType buffer, int bufferStart); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java index 03f18bcc..508fdf9a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/algo/LongArrayBase.java @@ -47,6 +47,7 @@ public interface LongArrayBase extends BulkTransferArray { set(start+i, buffer.get(i + bufferStart)); } } + default void get(long start, long end, LongBuffer buffer, int bufferStart) { for (int i = 0; i < (end-start); i++) { buffer.put(i + bufferStart, get(start + i)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java index 43a48c16..c772d43e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/array/page/AbstractPagingArray.java @@ -85,4 +85,23 @@ public class AbstractPagingArray, B> { bufferStart += eOff - sOff; } } + + public void get(long start, long end, B buffer, int bufferStart) { + assert end >= start; + + int page = partitioningScheme.getPage(start); + + long endPos; + + for (long pos = start; pos < end; pos = endPos) { + endPos = partitioningScheme.getPageEnd(pos, end); + + int sOff = partitioningScheme.getOffset(pos); + int eOff = partitioningScheme.getEndOffset(start, endPos); + + pages[page++].get(sOff, eOff, buffer, bufferStart); + + bufferStart += eOff - sOff; + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java new file mode 100644 index 00000000..8e685a2a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalCleaner.java @@ -0,0 +1,71 @@ +package nu.marginalia.wmsa.edge.index.postings.journal.reader; + +import nu.marginalia.util.array.LongArray; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.LongBuffer; +import java.nio.file.Path; +import java.util.function.Predicate; + +public class SearchIndexJournalCleaner { + private final SearchIndexJournalReader reader; + + public SearchIndexJournalCleaner(SearchIndexJournalReader reader) { + this.reader = reader; + } + + private long dryRunForNewSize(Predicate entryPredicate) { + long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; + + var pt = new ProgressTracker(); + + for (var entry : reader) { + if (entryPredicate.test(entry)) { + pos += entry.totalEntrySizeLongs(); + pt.update(pos); + } + } + + return pos; + } + + public void clean(Path outFile, Predicate entryPredicate) throws IOException { + + System.out.println("Dry run"); + long size = dryRunForNewSize(entryPredicate); + + System.out.println("Copying"); + LongArray outputArray = LongArray.mmapForWriting(outFile, size); + + long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS; + var pt = new ProgressTracker(); + + LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer(); + + for (var entry : reader) { + if (entryPredicate.test(entry)) { + pos += entry.copyTo(pos, adequateBuffer, outputArray); + pt.update(pos); + } + } + + outputArray.set(0, pos*8); + outputArray.set(1, reader.fileHeader().wordCount()); + + outputArray.force(); + } +} + +class ProgressTracker { + long stepSize = 100*1024*1024; + long pos = 0; + + public void update(long pos) { + if (this.pos / stepSize != pos / stepSize) { + System.out.printf("%d Mb\n", (800*pos)/stepSize); + } + this.pos = pos; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java index 40c2a433..97cd9e98 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReadEntry.java @@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEn import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; import java.nio.ByteBuffer; +import java.nio.LongBuffer; public class SearchIndexJournalReadEntry { private final long offset; @@ -15,6 +16,7 @@ public class SearchIndexJournalReadEntry { SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) { this.map = map; this.committedSize = committedSize; + final long sizeBlock = this.map.get(offset); final long docId = this.map.get(offset + 1); final long meta = this.map.get(offset + 2); @@ -74,18 +76,23 @@ public class SearchIndexJournalReadEntry { } public long nextId() { - return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize(); + return offset + totalEntrySizeLongs(); } public SearchIndexJournalReadEntry next() { return new SearchIndexJournalReadEntry(nextId(), map, committedSize); } - public void copyToBuffer(ByteBuffer buffer) { - var dest = buffer.asLongBuffer(); - dest.position(buffer.position() * 8); - dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS); - map.get(offset, dest); - buffer.position(dest.limit() * 8); + public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) { + long size = totalEntrySizeLongs(); + + map.get(offset, offset + size, adequateBuffer, 0); + destArray.set(pos, pos + size, adequateBuffer, 0); + + return size; + } + + public long totalEntrySizeLongs() { + return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java index 71811772..a8751f85 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/journal/reader/SearchIndexJournalReader.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.index.postings.journal.reader; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics; import org.jetbrains.annotations.NotNull; @@ -15,6 +16,8 @@ public interface SearchIndexJournalReader extends Iterable