mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Tool for cleaning raw index files based on a predicate.
This commit is contained in:
parent
cb408dd737
commit
4d3ef0e3b3
@ -3,4 +3,6 @@ package nu.marginalia.util.array.algo;
|
|||||||
public interface BulkTransferArray<BufferType> {
|
public interface BulkTransferArray<BufferType> {
|
||||||
|
|
||||||
void set(long start, long end, BufferType buffer, int bufferStart);
|
void set(long start, long end, BufferType buffer, int bufferStart);
|
||||||
|
|
||||||
|
void get(long start, long end, BufferType buffer, int bufferStart);
|
||||||
}
|
}
|
||||||
|
@ -47,6 +47,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
|
|||||||
set(start+i, buffer.get(i + bufferStart));
|
set(start+i, buffer.get(i + bufferStart));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
|
default void get(long start, long end, LongBuffer buffer, int bufferStart) {
|
||||||
for (int i = 0; i < (end-start); i++) {
|
for (int i = 0; i < (end-start); i++) {
|
||||||
buffer.put(i + bufferStart, get(start + i));
|
buffer.put(i + bufferStart, get(start + i));
|
||||||
|
@ -85,4 +85,23 @@ public class AbstractPagingArray<T extends BulkTransferArray<B>, B> {
|
|||||||
bufferStart += eOff - sOff;
|
bufferStart += eOff - sOff;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void get(long start, long end, B buffer, int bufferStart) {
|
||||||
|
assert end >= start;
|
||||||
|
|
||||||
|
int page = partitioningScheme.getPage(start);
|
||||||
|
|
||||||
|
long endPos;
|
||||||
|
|
||||||
|
for (long pos = start; pos < end; pos = endPos) {
|
||||||
|
endPos = partitioningScheme.getPageEnd(pos, end);
|
||||||
|
|
||||||
|
int sOff = partitioningScheme.getOffset(pos);
|
||||||
|
int eOff = partitioningScheme.getEndOffset(start, endPos);
|
||||||
|
|
||||||
|
pages[page++].get(sOff, eOff, buffer, bufferStart);
|
||||||
|
|
||||||
|
bufferStart += eOff - sOff;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,71 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
|
||||||
|
|
||||||
|
import nu.marginalia.util.array.LongArray;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.LongBuffer;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
|
public class SearchIndexJournalCleaner {
|
||||||
|
private final SearchIndexJournalReader reader;
|
||||||
|
|
||||||
|
public SearchIndexJournalCleaner(SearchIndexJournalReader reader) {
|
||||||
|
this.reader = reader;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long dryRunForNewSize(Predicate<SearchIndexJournalReadEntry> entryPredicate) {
|
||||||
|
long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
|
||||||
|
|
||||||
|
var pt = new ProgressTracker();
|
||||||
|
|
||||||
|
for (var entry : reader) {
|
||||||
|
if (entryPredicate.test(entry)) {
|
||||||
|
pos += entry.totalEntrySizeLongs();
|
||||||
|
pt.update(pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clean(Path outFile, Predicate<SearchIndexJournalReadEntry> entryPredicate) throws IOException {
|
||||||
|
|
||||||
|
System.out.println("Dry run");
|
||||||
|
long size = dryRunForNewSize(entryPredicate);
|
||||||
|
|
||||||
|
System.out.println("Copying");
|
||||||
|
LongArray outputArray = LongArray.mmapForWriting(outFile, size);
|
||||||
|
|
||||||
|
long pos = SearchIndexJournalReader.FILE_HEADER_SIZE_LONGS;
|
||||||
|
var pt = new ProgressTracker();
|
||||||
|
|
||||||
|
LongBuffer adequateBuffer = ByteBuffer.allocateDirect(100*1024*1024).asLongBuffer();
|
||||||
|
|
||||||
|
for (var entry : reader) {
|
||||||
|
if (entryPredicate.test(entry)) {
|
||||||
|
pos += entry.copyTo(pos, adequateBuffer, outputArray);
|
||||||
|
pt.update(pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
outputArray.set(0, pos*8);
|
||||||
|
outputArray.set(1, reader.fileHeader().wordCount());
|
||||||
|
|
||||||
|
outputArray.force();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Console progress reporter that prints one line each time the tracked
 * position moves into a different {@code stepSize}-sized bucket.
 */
class ProgressTracker {
    /** Width of one reporting bucket, in the same unit as the tracked position. */
    long stepSize = 100*1024*1024;
    /** Position passed to the most recent {@link #update(long)} call. */
    long pos = 0;

    /**
     * Records a new position and prints a progress line if it falls in a
     * different bucket than the previously recorded one.
     *
     * <p>The printed figure is {@code 800 * pos / stepSize}; with the default
     * step size that equals {@code 8 * pos / (1024 * 1024)}, i.e. megabytes
     * when {@code pos} counts 8-byte longs.
     *
     * @param pos the new position
     */
    public void update(long pos) {
        final long previousBucket = this.pos / stepSize;
        final long currentBucket = pos / stepSize;

        if (previousBucket != currentBucket) {
            final long reported = (800*pos)/stepSize;
            System.out.printf("%d Mb\n", reported);
        }

        this.pos = pos;
    }

}
|
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEn
|
|||||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
|
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.LongBuffer;
|
||||||
|
|
||||||
public class SearchIndexJournalReadEntry {
|
public class SearchIndexJournalReadEntry {
|
||||||
private final long offset;
|
private final long offset;
|
||||||
@ -15,6 +16,7 @@ public class SearchIndexJournalReadEntry {
|
|||||||
SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) {
|
SearchIndexJournalReadEntry(long offset, LongArray map, long committedSize) {
|
||||||
this.map = map;
|
this.map = map;
|
||||||
this.committedSize = committedSize;
|
this.committedSize = committedSize;
|
||||||
|
|
||||||
final long sizeBlock = this.map.get(offset);
|
final long sizeBlock = this.map.get(offset);
|
||||||
final long docId = this.map.get(offset + 1);
|
final long docId = this.map.get(offset + 1);
|
||||||
final long meta = this.map.get(offset + 2);
|
final long meta = this.map.get(offset + 2);
|
||||||
@ -74,18 +76,23 @@ public class SearchIndexJournalReadEntry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public long nextId() {
|
public long nextId() {
|
||||||
return offset + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS + header.entrySize();
|
return offset + totalEntrySizeLongs();
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchIndexJournalReadEntry next() {
|
public SearchIndexJournalReadEntry next() {
|
||||||
return new SearchIndexJournalReadEntry(nextId(), map, committedSize);
|
return new SearchIndexJournalReadEntry(nextId(), map, committedSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void copyToBuffer(ByteBuffer buffer) {
|
public long copyTo(long pos, LongBuffer adequateBuffer, LongArray destArray) {
|
||||||
var dest = buffer.asLongBuffer();
|
long size = totalEntrySizeLongs();
|
||||||
dest.position(buffer.position() * 8);
|
|
||||||
dest.limit(buffer.position() * 8 + header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS);
|
map.get(offset, offset + size, adequateBuffer, 0);
|
||||||
map.get(offset, dest);
|
destArray.set(pos, pos + size, adequateBuffer, 0);
|
||||||
buffer.position(dest.limit() * 8);
|
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long totalEntrySizeLongs() {
|
||||||
|
return header.entrySize() + SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
|
package nu.marginalia.wmsa.edge.index.postings.journal.reader;
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
|
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
|
||||||
|
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalFileHeader;
|
||||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
|
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@ -15,6 +16,8 @@ public interface SearchIndexJournalReader extends Iterable<SearchIndexJournalRea
|
|||||||
return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE];
|
return new long[SearchIndexJournalEntry.MAX_LENGTH * SearchIndexJournalEntry.ENTRY_SIZE];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SearchIndexJournalFileHeader fileHeader();
|
||||||
|
|
||||||
SearchIndexJournalStatistics getStatistics();
|
SearchIndexJournalStatistics getStatistics();
|
||||||
|
|
||||||
void forEachWordId(IntConsumer consumer);
|
void forEachWordId(IntConsumer consumer);
|
||||||
|
@ -46,6 +46,10 @@ public class SearchIndexJournalReaderSingleFile implements SearchIndexJournalRea
|
|||||||
this.entryPredicate = entryPredicate;
|
this.entryPredicate = entryPredicate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SearchIndexJournalFileHeader fileHeader() {
|
||||||
|
return fileHeader;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean filter(SearchIndexJournalReadEntry entry) {
|
public boolean filter(SearchIndexJournalReadEntry entry) {
|
||||||
return entryPredicate == null || entryPredicate.test(entry);
|
return entryPredicate == null || entryPredicate.test(entry);
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,28 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.tools;
|
||||||
|
|
||||||
|
import nu.marginalia.util.array.LongArray;
|
||||||
|
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalCleaner;
|
||||||
|
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReadEntry;
|
||||||
|
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import static nu.marginalia.wmsa.edge.index.model.EdgePageDocumentFlags.Simple;
|
||||||
|
|
||||||
|
public class StripSimpleJournalEntriesToolMain {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
Path input = Path.of(args[0]);
|
||||||
|
Path output = Path.of(args[1]);
|
||||||
|
|
||||||
|
new SearchIndexJournalCleaner(new SearchIndexJournalReaderSingleFile(LongArray.mmapRead(input)))
|
||||||
|
.clean(output, StripSimpleJournalEntriesToolMain::retainEntry);
|
||||||
|
|
||||||
|
System.out.println("All done!");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean retainEntry(SearchIndexJournalReadEntry entry) {
|
||||||
|
return (entry.header.documentMeta() & Simple.asBit()) == 0;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user