Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00.
Tool for cleaning raw index files based on a predicate.
This commit is contained in:
parent
cb408dd737
commit
4d3ef0e3b3
@ -3,4 +3,6 @@ package nu.marginalia.util.array.algo;
|
||||
/**
 * An array supporting bulk transfers of whole [start, end) ranges to and
 * from an external staging buffer (e.g. a LongBuffer).
 *
 * @param <BufferType> the buffer type used to stage bulk reads and writes
 */
public interface BulkTransferArray<BufferType> {

    /** Writes buffer[bufferStart .. bufferStart+(end-start)) into array positions [start, end). */
    void set(long start, long end, BufferType buffer, int bufferStart);

    /** Reads array positions [start, end) into the buffer starting at bufferStart. */
    void get(long start, long end, BufferType buffer, int bufferStart);
}
|
||||
|
@ -47,6 +47,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
|
||||
set(start+i, buffer.get(i + bufferStart));
|
||||
}
|
||||
}
|
||||
|
||||
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||
for (int i = 0; i < (end-start); i++) {
|
||||
buffer.put(i + bufferStart, get(start + i));
|
||||
|
@ -85,4 +85,23 @@ public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
|
||||
bufferStart += eOff - sOff;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Bulk read of positions [start, end) into the buffer starting at
 * bufferStart, walking the backing pages one page-chunk at a time.
 */
public void get(long start, long end, B buffer, int bufferStart) {
    assert end >= start;

    // Page holding the first position; incremented each iteration, relying
    // on every iteration consuming exactly one page's worth of the range.
    int page = partitioningScheme.getPage(start);

    long endPos;

    for (long pos = start; pos < end; pos = endPos) {
        // End of the current page-chunk, capped at 'end'.
        endPos = partitioningScheme.getPageEnd(pos, end);

        int sOff = partitioningScheme.getOffset(pos);
        // NOTE(review): 'start' is passed here while the matching getOffset
        // call above takes 'pos'; after the first iteration 'start' lies in
        // an earlier page, so this looks like it should be
        // getEndOffset(pos, endPos) — confirm against the sibling set()
        // implementation before relying on multi-page reads.
        int eOff = partitioningScheme.getEndOffset(start, endPos);

        pages[page++].get(sOff, eOff, buffer, bufferStart);

        // Advance the buffer cursor by the number of longs just copied.
        bufferStart += eOff - sOff;
    }
}
|
||||
}
|
||||
|
@ -0,0 +1,71 @@
|
||||
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
|
||||
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.file.Path;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public class SearchIndexJournalCleaner {
|
||||
private final SearchIndexJournalReader reader;
|
||||
|
||||
public SearchIndexJournalCleaner(SearchIndexJournalReader reader) {
|
||||
this.reader = reader;
|
||||
}
|
||||
|
||||
private long dryRunForNewSize(Predicate<SearchIndexJournalReadEntry> entryPredicate) {
|
||||
long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
|
||||
|
||||
var pt = new ProgressTracker();
|
||||
|
||||
for (var entry : reader) {
|
||||
if (entryPredicate.test(entry)) {
|
||||
pos += entry.totalEntrySizeLongs();
|
||||
pt.update(pos);
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
public void clean(Path outFile, Predicate<SearchIndexJournalReadEntry> entryPredicate) throws IOException {
|
||||
|
||||
System.out.println("Dry run");
|
||||
long size = dryRunForNewSize(entryPredicate);
|
||||
|
||||
System.out.println("Copying");
|
||||
LongArray outputArray = LongArray.mmapForWriting(outFile, size);
|
||||
|
||||
long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
|
||||
var pt = new ProgressTracker();
|
||||
|
||||
LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer();
|
||||
|
||||
for (var entry : reader) {
|
||||
if (entryPredicate.test(entry)) {
|
||||
pos += entry.copyTo(pos, adequateBuffer, outputArray);
|
||||
pt.update(pos);
|
||||
}
|
||||
}
|
||||
|
||||
outputArray.set(0, pos*8);
|
||||
outputArray.set(1, reader.fileHeader().wordCount());
|
||||
|
||||
outputArray.force();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Prints coarse progress to stdout, emitting one line (megabytes processed)
 * each time the position crosses a 100 MB boundary.
 */
class ProgressTracker {
    // Reporting granularity: one printout per this many bytes of progress.
    long stepSize = 100*1024*1024;
    // Position recorded by the previous update() call, in longs.
    long pos = 0;

    /**
     * Records a new position and prints a progress line if a step boundary
     * was crossed since the previous call.
     *
     * @param pos current position, measured in longs (8 bytes each)
     */
    public void update(long pos) {
        if (this.pos / stepSize != pos / stepSize) {
            // pos counts longs, so pos*8 is bytes; divide by 2^20 for MB.
            // (Replaces the obscure but arithmetically identical (800*pos)/stepSize.)
            System.out.printf("%d Mb\n", (pos * 8) / (1024 * 1024));
        }
        this.pos = pos;
    }

}
|
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEn
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
|
||||
public class SearchIndexJournalReadEntry {
|
||||
private final long offset;
|
||||
@ -15,6 +16,7 @@ public class SearchIndexJournalReadEntry {
|
||||
SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) {
|
||||
this.map = map;
|
||||
this.committedSize = committedSize;
|
||||
|
||||
final long sizeBlock = this.map.get(offset);
|
||||
final long docId = this.map.get(offset + 1);
|
||||
final long meta = this.map.get(offset + 2);
|
||||
@ -74,18 +76,23 @@ public class SearchIndexJournalReadEntry {
|
||||
}
|
||||
|
||||
public long nextId() {
|
||||
return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize();
|
||||
return offset + totalEntrySizeLongs();
|
||||
}
|
||||
|
||||
public SearchIndexJournalReadEntry next() {
|
||||
return new SearchIndexJournalReadEntry(nextId(), map, committedSize);
|
||||
}
|
||||
|
||||
public void copyToBuffer(ByteBuffer buffer) {
|
||||
var dest = buffer.asLongBuffer();
|
||||
dest.position(buffer.position() * 8);
|
||||
dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS);
|
||||
map.get(offset, dest);
|
||||
buffer.position(dest.limit() * 8);
|
||||
/**
 * Copies this entire entry (header plus payload, totalEntrySizeLongs() longs)
 * out of the source mapping into destArray at position pos.
 *
 * @param pos            destination position in destArray, in longs
 * @param adequateBuffer scratch buffer used to stage the copy; must hold at
 *                       least totalEntrySizeLongs() longs — sizing is the
 *                       caller's responsibility (hence the name)
 * @param destArray      destination written at [pos, pos + size)
 * @return the number of longs copied, i.e. this entry's total size
 */
public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) {
    long size = totalEntrySizeLongs();

    // Stage the entry into the buffer, then flush it to the destination.
    map.get(offset, offset + size, adequateBuffer, 0);
    destArray.set(pos, pos + size, adequateBuffer, 0);

    return size;
}
|
||||
|
||||
public long totalEntrySizeLongs() {
|
||||
return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@ -15,6 +16,8 @@ public interface SearchIndexJournalReader extends Iterable<SearchIndexJournalRea
|
||||
return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE];
|
||||
}
|
||||
|
||||
SearchIndexJournalFileHeader fileHeader();
|
||||
|
||||
SearchIndexJournalStatistics getStatistics();
|
||||
|
||||
void forEachWordId(IntConsumer consumer);
|
||||
|
@ -46,6 +46,10 @@ public class SearchIndexJournalReaderSingleFile implements SearchIndexJournalRea
|
||||
this.entryPredicate = entryPredicate;
|
||||
}
|
||||
|
||||
/** Exposes the journal's parsed file header. */
public SearchIndexJournalFileHeader fileHeader() {
    return fileHeader;
}
|
||||
|
||||
public boolean filter(SearchIndexJournalReadEntry entry) {
|
||||
return entryPredicate == null || entryPredicate.test(entry);
|
||||
}
|
||||
|
@ -0,0 +1,28 @@
|
||||
package nu.marginalia.wmsa.edge.tools;
|
||||
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReadEntry;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags.Simple;
|
||||
|
||||
public class StripSimpleJournalEntriesToolMain {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Path input = Path.of(args[0]);
|
||||
Path output = Path.of(args[1]);
|
||||
|
||||
new SearchIndexJournalCleaner(new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(input)))
|
||||
.clean(output, StripSimpleJournalEntriesToolMain::retainEntry);
|
||||
|
||||
System.out.println("All done!");
|
||||
}
|
||||
|
||||
private static boolean retainEntry(SearchIndexJournalReadEntry entry) {
|
||||
return (entry.header.documentMeta() & Simple.asBit()) == 0;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user