Tool for cleaning raw index journal files based on a predicate, keeping only the entries that match it.

Viktor Lofgren 2023-01-11 16:11:29 +01:00
parent cb408dd737
commit 4d3ef0e3b3
8 changed files with 142 additions and 7 deletions
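In outline, the new cleaner wraps a memory-mapped journal in a reader and is handed a predicate that decides which entries survive. A minimal driver sketch (the keep-everything predicate and the argument handling are placeholders; the actual entry point added in this commit is StripSimpleJournalEntriesToolMain at the bottom of the diff):

import nu.marginalia.util.array.LongArray;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;

import java.io.IOException;
import java.nio.file.Path;

class CleanerSketch {
    public static void main(String[] args) throws IOException {
        var reader = new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(Path.of(args[0])));

        // Entries for which the predicate returns true are copied to the output file
        new SearchIndexJournalCleaner(reader)
                .clean(Path.of(args[1]), entry -> true);
    }
}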

View File

@ -3,4 +3,6 @@ package nu.marginalia.util.array.algo;
public interface BulkTransferArray<BufferType> {
void set(long start, long end, BufferType buffer, int bufferStart);
void get(long start, long end, BufferType buffer, int bufferStart);
}
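To make the bulk-transfer contract concrete, here is a hypothetical in-memory implementation (not part of this commit): set() copies (end - start) longs out of the buffer starting at bufferStart, and the newly added get() does the reverse.

import nu.marginalia.util.array.algo.BulkTransferArray;

import java.nio.LongBuffer;

class HeapLongTransferArray implements BulkTransferArray<LongBuffer> {
    private final long[] data;

    HeapLongTransferArray(int size) {
        this.data = new long[size];
    }

    public void set(long start, long end, LongBuffer buffer, int bufferStart) {
        for (int i = 0; i < (end - start); i++) {
            data[(int) (start + i)] = buffer.get(bufferStart + i);
        }
    }

    public void get(long start, long end, LongBuffer buffer, int bufferStart) {
        for (int i = 0; i < (end - start); i++) {
            buffer.put(bufferStart + i, data[(int) (start + i)]);
        }
    }
}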

View File

@ -47,6 +47,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
set(start+i, buffer.get(i + bufferStart));
}
}
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.put(i + bufferStart, get(start + i));

View File

@ -85,4 +85,23 @@ public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
            bufferStart += eOff - sOff;
        }
    }

    public void get(long start, long end, B buffer, int bufferStart) {
        assert end >= start;

        int page = partitioningScheme.getPage(start);
        long endPos;

        for (long pos = start; pos < end; pos = endPos) {
            endPos = partitioningScheme.getPageEnd(pos, end);

            int sOff = partitioningScheme.getOffset(pos);
            int eOff = partitioningScheme.getEndOffset(start, endPos);

            pages[page++].get(sOff, eOff, buffer, bufferStart);

            bufferStart += eOff - sOff;
        }
    }
}
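The new get() walks the requested range page by page: each iteration bulk-copies one per-page slice into the destination buffer and advances the buffer offset by the slice length. A self-contained sketch of that traversal, with a fixed hypothetical page size standing in for the real partitioning scheme:

class PageSplitSketch {
    static void get(long start, long end, long pageSize) {
        int bufferStart = 0;
        long endPos;

        for (long pos = start; pos < end; pos = endPos) {
            endPos = Math.min(end, (pos / pageSize + 1) * pageSize); // end of the current page, capped at end
            int page = (int) (pos / pageSize);
            int sOff = (int) (pos % pageSize);
            int eOff = (int) (endPos - page * pageSize);

            System.out.printf("page %d: [%d, %d) -> buffer offset %d%n", page, sOff, eOff, bufferStart);
            bufferStart += eOff - sOff;
        }
    }

    public static void main(String[] args) {
        // Reading [6, 14) with a page size of 8 touches page 0 at [6, 8) and page 1 at [0, 6)
        get(6, 14, 8);
    }
}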

View File

@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.index.postings.journal.reader;

import nu.marginalia.util.array.LongArray;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.nio.file.Path;
import java.util.function.Predicate;

public class SearchIndexJournalCleaner {

    private final SearchIndexJournalReader reader;

    public SearchIndexJournalCleaner(SearchIndexJournalReader reader) {
        this.reader = reader;
    }

    // First pass: calculate how large (in longs) the cleaned journal will be when only
    // the entries matching the predicate are kept, including the file header.
    private long dryRunForNewSize(Predicate<SearchIndexJournalReadEntry> entryPredicate) {
        long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
        var pt = new ProgressTracker();

        for (var entry : reader) {
            if (entryPredicate.test(entry)) {
                pos += entry.totalEntrySizeLongs();
                pt.update(pos);
            }
        }

        return pos;
    }

    // Second pass: map the output file at its final size, copy the retained entries,
    // then patch the file header.
    public void clean(Path outFile, Predicate<SearchIndexJournalReadEntry> entryPredicate) throws IOException {
        System.out.println("Dry run");
        long size = dryRunForNewSize(entryPredicate);

        System.out.println("Copying");
        LongArray outputArray = LongArray.mmapForWriting(outFile, size);

        long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
        var pt = new ProgressTracker();

        // Staging buffer used by copyTo() to move one entry at a time between the mapped arrays
        LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer();

        for (var entry : reader) {
            if (entryPredicate.test(entry)) {
                pos += entry.copyTo(pos, adequateBuffer, outputArray);
                pt.update(pos);
            }
        }

        // File header: slot 0 holds the size of the written data in bytes, slot 1 the word count
        outputArray.set(0, pos*8);
        outputArray.set(1, reader.fileHeader().wordCount());

        outputArray.force();
    }
}

class ProgressTracker {
    long stepSize = 100*1024*1024;
    long pos = 0;

    public void update(long pos) {
        // pos is counted in longs, so (800*pos)/stepSize works out to megabytes written
        if (this.pos / stepSize != pos / stepSize) {
            System.out.printf("%d Mb\n", (800*pos)/stepSize);
        }
        this.pos = pos;
    }
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEn
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
public class SearchIndexJournalReadEntry {
private final long offset;
@ -15,6 +16,7 @@ public class SearchIndexJournalReadEntry {
SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) {
this.map = map;
this.committedSize = committedSize;
final long sizeBlock = this.map.get(offset);
final long docId = this.map.get(offset + 1);
final long meta = this.map.get(offset + 2);
@ -74,18 +76,23 @@ public class SearchIndexJournalReadEntry {
    }

    public long nextId() {
-       return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize();
+       return offset + totalEntrySizeLongs();
    }

    public SearchIndexJournalReadEntry next() {
        return new SearchIndexJournalReadEntry(nextId(), map, committedSize);
    }

-   public void copyToBuffer(ByteBuffer buffer) {
-       var dest = buffer.asLongBuffer();
-       dest.position(buffer.position() * 8);
-       dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS);
-       map.get(offset, dest);
-       buffer.position(dest.limit() * 8);
+   public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) {
+       long size = totalEntrySizeLongs();
+       map.get(offset, offset + size, adequateBuffer, 0);
+       destArray.set(pos, pos + size, adequateBuffer, 0);
+       return size;
    }
+   public long totalEntrySizeLongs() {
+       return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
+   }
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
import org.jetbrains.annotations.NotNull;
@ -15,6 +16,8 @@ public interface SearchIndexJournalReader extends Iterable<SearchIndexJournalRea
return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE];
}
SearchIndexJournalFileHeader fileHeader();
SearchIndexJournalStatistics getStatistics();
void forEachWordId(IntConsumer consumer);

View File

@ -46,6 +46,10 @@ public class SearchIndexJournalReaderSingleFile implements SearchIndexJournalRea
this.entryPredicate = entryPredicate;
}
public SearchIndexJournalFileHeader fileHeader() {
return fileHeader;
}
public boolean filter(SearchIndexJournalReadEntry entry) {
return entryPredicate == null || entryPredicate.test(entry);
}

View File

@ -0,0 +1,28 @@
package nu.marginalia.wmsa.edge.tools;

import nu.marginalia.util.array.LongArray;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReadEntry;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;

import java.io.IOException;
import java.nio.file.Path;

import static nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags.Simple;

public class StripSimpleJournalEntriesToolMain {
    public static void main(String[] args) throws IOException {
        Path input = Path.of(args[0]);
        Path output = Path.of(args[1]);

        new SearchIndexJournalCleaner(new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(input)))
                .clean(output, StripSimpleJournalEntriesToolMain::retainEntry);

        System.out.println("All done!");
    }

    // Keep only entries whose document metadata does not have the Simple flag set
    private static boolean retainEntry(SearchIndexJournalReadEntry entry) {
        return (entry.header.documentMeta() & Simple.asBit()) == 0;
    }
}