(index-reverse) Simplify priority index

* Do not emit a documents file
* Do not interlace metadata or offsets with doc ids
This commit is contained in:
Viktor Lofgren 2024-07-06 16:12:29 +02:00
parent 85c99ae808
commit fa36689597
17 changed files with 244 additions and 78 deletions

View File

@ -21,7 +21,7 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
public class ReverseIndexReader { public class FullReverseIndexReader {
private final LongArray words; private final LongArray words;
private final LongArray documents; private final LongArray documents;
private final long wordsDataOffset; private final long wordsDataOffset;
@ -31,10 +31,10 @@ public class ReverseIndexReader {
private final PositionsFileReader positionsFileReader; private final PositionsFileReader positionsFileReader;
public ReverseIndexReader(String name, public FullReverseIndexReader(String name,
Path words, Path words,
Path documents, Path documents,
PositionsFileReader positionsFileReader) throws IOException { PositionsFileReader positionsFileReader) throws IOException {
this.name = name; this.name = name;
this.positionsFileReader = positionsFileReader; this.positionsFileReader = positionsFileReader;
@ -138,7 +138,7 @@ public class ReverseIndexReader {
private BTreeReader createReaderNew(long offset) { private BTreeReader createReaderNew(long offset) {
return new BTreeReader( return new BTreeReader(
documents, documents,
ReverseIndexParameters.docsBTreeContext, ReverseIndexParameters.fullDocsBTreeContext,
offset); offset);
} }

View File

@ -0,0 +1,99 @@
package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
/** Reader for the priority reverse index, which maps term ids to plain
 *  document ids (one long per record) -- no positions or metadata are
 *  stored, unlike the full reverse index.
 *  <p>
 *  If the backing files do not exist, the reader is constructed in a
 *  "stub" state that drops all queries instead of failing.
 */
public class PrioReverseIndexReader {
    private final LongArray words;
    private final LongArray documents;
    private final long wordsDataOffset;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final BTreeReader wordsBTreeReader;
    private final String name;

    public PrioReverseIndexReader(String name,
                                  Path words,
                                  Path documents) throws IOException {
        this.name = name;

        // Stub state: index not yet constructed; queries will be dropped
        if (!Files.exists(words) || !Files.exists(documents)) {
            this.words = null;
            this.documents = null;
            this.wordsBTreeReader = null;
            this.wordsDataOffset = -1;
            return;
        }

        logger.info("Switching reverse index");

        this.words = LongArrayFactory.mmapForReadingShared(words);
        this.documents = LongArrayFactory.mmapForReadingShared(documents);

        wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
        wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
    }

    /** Calculate the offset of the word in the documents.
     * If the return-value is negative, the term does not exist
     * in the index.
     */
    long wordOffset(long termId) {
        // Guard against the stub state (index files absent).  Previously
        // numDocuments() could reach this method with a null
        // wordsBTreeReader and throw an NPE.
        if (wordsBTreeReader == null)
            return -1L;

        long idx = wordsBTreeReader.findEntry(termId);

        if (idx < 0)
            return -1L;

        // Each word entry is (termId, offset); +1 selects the offset slot
        return words.get(wordsDataOffset + idx + 1);
    }

    /** Return an EntrySource over the documents containing termId;
     *  an empty source if the term is absent or the index is not ready. */
    public EntrySource documents(long termId) {
        if (null == words) {
            logger.warn("Reverse index is not ready, dropping query");
            return new EmptyEntrySource();
        }

        long offset = wordOffset(termId);

        if (offset < 0) // No documents
            return new EmptyEntrySource();

        return new ReverseIndexEntrySource(name, createReaderNew(offset), 1, termId);
    }

    /** Return the number of documents with the termId in the index */
    public int numDocuments(long termId) {
        long offset = wordOffset(termId);

        if (offset < 0)
            return 0;

        return createReaderNew(offset).numEntries();
    }

    /** Create a BTreeReader for the document offset associated with a termId */
    private BTreeReader createReaderNew(long offset) {
        return new BTreeReader(
                documents,
                ReverseIndexParameters.prioDocsBTreeContext,
                offset);
    }

    /** Release the memory-mapped index files, if they were opened. */
    public void close() {
        if (documents != null)
            documents.close();

        if (words != null)
            words.close();
    }
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.btree.model.BTreeContext;
public class ReverseIndexParameters public class ReverseIndexParameters
{ {
public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
} }

View File

@ -22,7 +22,7 @@ public class ReverseIndexSelfTest {
public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) { public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 2"); logger.info("Starting test 2");
for (long i = 1; i < wordsDataRange.size(); i+=2) { for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader(); var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
@ -49,7 +49,7 @@ public class ReverseIndexSelfTest {
public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) { public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 4"); logger.info("Starting test 4");
for (long i = 1; i < wordsDataRange.size(); i+=2) { for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader(); var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
for (int j = 0; j < docRange.size(); j+=2) { for (int j = 0; j < docRange.size(); j+=2) {
@ -84,7 +84,7 @@ public class ReverseIndexSelfTest {
public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) { public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
logger.info("Starting test 6"); logger.info("Starting test 6");
for (long i = 1; i < wordsDataRange.size(); i+=2) { for (long i = 1; i < wordsDataRange.size(); i+=2) {
var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
var header = docsBTreeReader.getHeader(); var header = docsBTreeReader.getHeader();
var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
Long prev = null; Long prev = null;

View File

@ -82,7 +82,7 @@ public class FullPreindex {
// Estimate the size of the docs index data // Estimate the size of the docs index data
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2)); offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2); IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2);
offsets.fold(0, 0, offsets.size(), sizeEstimator); offsets.fold(0, 0, offsets.size(), sizeEstimator);
// Write the docs file // Write the docs file
@ -90,7 +90,7 @@ public class FullPreindex {
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(), offsets.transformEachIO(0, offsets.size(),
new FullIndexBTreeTransformer(finalDocs, 2, new FullIndexBTreeTransformer(finalDocs, 2,
ReverseIndexParameters.docsBTreeContext, ReverseIndexParameters.fullDocsBTreeContext,
intermediateDocChannel)); intermediateDocChannel));
intermediateDocChannel.force(false); intermediateDocChannel.force(false);
} }

View File

@ -20,7 +20,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to /** A LongArray with document data, segmented according to
* the associated ReversePreindexWordSegments data * the associated FullPreindexWordSegments data
*/ */
public class FullPreindexDocuments { public class FullPreindexDocuments {
public final LongArray documents; public final LongArray documents;

View File

@ -55,8 +55,7 @@ public class PrioIndexConstructor {
} }
try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName); try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes"); var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")
var posConstructor = new PositionsFileConstructor(outputFilePositions)
) { ) {
heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
@ -66,7 +65,7 @@ public class PrioIndexConstructor {
.parallelStream() .parallelStream()
.map(in -> { .map(in -> {
preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
return construct(in, posConstructor); return construct(in);
}) })
.reduce(this::merge) .reduce(this::merge)
.ifPresent((index) -> { .ifPresent((index) -> {
@ -80,9 +79,9 @@ public class PrioIndexConstructor {
} }
@SneakyThrows @SneakyThrows
private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) { private PrioPreindexReference construct(Path input) {
return PrioPreindex return PrioPreindex
.constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir) .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)
.closeToReference(); .closeToReference();
} }

View File

@ -7,7 +7,6 @@ import nu.marginalia.index.ReverseIndexParameters;
import nu.marginalia.index.construction.CountToOffsetTransformer; import nu.marginalia.index.construction.CountToOffsetTransformer;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.IndexSizeEstimator; import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -44,7 +43,6 @@ public class PrioPreindex {
* will have randomly assigned names. * will have randomly assigned names.
*/ */
public static PrioPreindex constructPreindex(IndexJournalReader reader, public static PrioPreindex constructPreindex(IndexJournalReader reader,
PositionsFileConstructor positionsFileConstructor,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
Path workDir) throws IOException Path workDir) throws IOException
{ {
@ -53,7 +51,7 @@ public class PrioPreindex {
Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments); var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
return new PrioPreindex(segments, docs); return new PrioPreindex(segments, docs);
} }
@ -81,16 +79,16 @@ public class PrioPreindex {
Files.deleteIfExists(outputFileWords); Files.deleteIfExists(outputFileWords);
// Estimate the size of the docs index data // Estimate the size of the docs index data
offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2)); offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(1));
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2); IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.prioDocsBTreeContext, 1);
offsets.fold(0, 0, offsets.size(), sizeEstimator); offsets.fold(0, 0, offsets.size(), sizeEstimator);
// Write the docs file // Write the docs file
LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size); LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
offsets.transformEachIO(0, offsets.size(), offsets.transformEachIO(0, offsets.size(),
new PrioIndexBTreeTransformer(finalDocs, 2, new PrioIndexBTreeTransformer(finalDocs, 1,
ReverseIndexParameters.docsBTreeContext, ReverseIndexParameters.prioDocsBTreeContext,
intermediateDocChannel)); intermediateDocChannel));
intermediateDocChannel.force(false); intermediateDocChannel.force(false);
} }
@ -137,9 +135,9 @@ public class PrioPreindex {
PrioPreindexWordSegments mergingSegment = PrioPreindexWordSegments mergingSegment =
createMergedSegmentWordFile(destDir, left.segments, right.segments); createMergedSegmentWordFile(destDir, left.segments, right.segments);
var mergingIter = mergingSegment.constructionIterator(2); var mergingIter = mergingSegment.constructionIterator(1);
var leftIter = left.segments.iterator(2); var leftIter = left.segments.iterator(1);
var rightIter = right.segments.iterator(2); var rightIter = right.segments.iterator(1);
Path docsFile = Files.createTempFile(destDir, "docs", ".dat"); Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
@ -200,7 +198,7 @@ public class PrioPreindex {
// duplicates in the data, so we need to shrink it to the actual size we wrote. // duplicates in the data, so we need to shrink it to the actual size we wrote.
mergedDocuments = shrinkMergedDocuments(mergedDocuments, mergedDocuments = shrinkMergedDocuments(mergedDocuments,
docsFile, 2 * mergingSegment.totalSize()); docsFile, mergingSegment.totalSize());
return new PrioPreindex( return new PrioPreindex(
mergingSegment, mergingSegment,
@ -274,8 +272,7 @@ public class PrioPreindex {
leftIter.startOffset, leftIter.endOffset, leftIter.startOffset, leftIter.endOffset,
rightIter.startOffset, rightIter.endOffset); rightIter.startOffset, rightIter.endOffset);
long distinct = segSize / 2; destIter.putNext(segSize);
destIter.putNext(distinct);
leftIter.next(); leftIter.next();
rightIter.next(); rightIter.next();
} }
@ -297,7 +294,7 @@ public class PrioPreindex {
mergingIter.startOffset, mergingIter.startOffset,
end); end);
boolean putNext = mergingIter.putNext(size / 2); boolean putNext = mergingIter.putNext(size);
boolean iterNext = sourceIter.next(); boolean iterNext = sourceIter.next();
if (!putNext && iterNext) if (!putNext && iterNext)

View File

@ -4,7 +4,6 @@ import lombok.SneakyThrows;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.rwf.RandomFileAssembler; import nu.marginalia.rwf.RandomFileAssembler;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -20,13 +19,12 @@ import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to /** A LongArray with document data, segmented according to
* the associated ReversePreindexWordSegments data * the associated FullPreindexWordSegments data
*/ */
public class PrioPreindexDocuments { public class PrioPreindexDocuments {
public final LongArray documents; public final LongArray documents;
private static PositionsFileConstructor positionsFileConstructor; private static final int RECORD_SIZE_LONGS = 1;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class); private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
public final Path file; public final Path file;
@ -41,9 +39,7 @@ public class PrioPreindexDocuments {
Path workDir, Path workDir,
IndexJournalReader reader, IndexJournalReader reader,
DocIdRewriter docIdRewriter, DocIdRewriter docIdRewriter,
PositionsFileConstructor positionsFileConstructor,
PrioPreindexWordSegments segments) throws IOException { PrioPreindexWordSegments segments) throws IOException {
PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
@ -88,11 +84,7 @@ public class PrioPreindexDocuments {
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
// write position data to the positions file and get the offset assembly.put(offset, rankEncodedId);
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, encodedPosOffset);
} }
} }
@ -112,11 +104,10 @@ public class PrioPreindexDocuments {
long iterEnd = iter.endOffset; long iterEnd = iter.endOffset;
if (iter.size() < 1024) { if (iter.size() < 1024) {
docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd); docsFileMap.sort(iterStart, iterEnd);
} }
else { else {
sortingWorkers.execute(() -> sortingWorkers.execute(() -> docsFileMap.sort(iterStart, iterEnd));
docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
} }
} }

View File

@ -22,7 +22,7 @@ import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm; import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
class ReverseIndexReaderTest { class FullReverseIndexReaderTest {
TestJournalFactory journalFactory; TestJournalFactory journalFactory;
Path tempDir; Path tempDir;
@ -82,7 +82,7 @@ class ReverseIndexReaderTest {
} }
private long[] readEntries(ReverseIndexReader reader, long wordId) { private long[] readEntries(FullReverseIndexReader reader, long wordId) {
var es = reader.documents(wordId); var es = reader.documents(wordId);
assertTrue(es.hasMore()); assertTrue(es.hasMore());
LongQueryBuffer buffer = new LongQueryBuffer(4); LongQueryBuffer buffer = new LongQueryBuffer(4);
@ -91,7 +91,7 @@ class ReverseIndexReaderTest {
return buffer.copyData(); return buffer.copyData();
} }
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { private FullReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario); var reader = journalFactory.createReader(scenario);
Path posFile = tempDir.resolve("positions.dat"); Path posFile = tempDir.resolve("positions.dat");
@ -106,7 +106,7 @@ class ReverseIndexReaderTest {
preindex.delete(); preindex.delete();
} }
return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile)); return new FullReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
} }
} }

View File

@ -26,7 +26,7 @@ public class ReverseIndexDebugTest {
long wordOffset = wordsBTreeReader.findEntry(problemWord); long wordOffset = wordsBTreeReader.findEntry(problemWord);
assertTrue(wordOffset >= 0); assertTrue(wordOffset >= 0);
var docsReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordOffset); var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
// We find problemDoc even though it doesn't exist in the document range // We find problemDoc even though it doesn't exist in the document range
long docOffset = docsReader.findEntry(problemDoc); long docOffset = docsReader.findEntry(problemDoc);

View File

@ -58,7 +58,7 @@ public class TestJournalFactory {
return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
} }
IndexJournalReader createReader(EntryData... entries) throws IOException { public IndexJournalReader createReader(EntryData... entries) throws IOException {
Path jf = Files.createTempFile(tempDir, "journal", ".dat"); Path jf = Files.createTempFile(tempDir, "journal", ".dat");
var writer = new IndexJournalWriterSingleFileImpl(jf); var writer = new IndexJournalWriterSingleFileImpl(jf);

View File

@ -0,0 +1,86 @@
package nu.marginalia.index.construction.prio;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.TestJournalFactory;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.construction.full.TestJournalFactory.*;
import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/** Tests constructing, finalizing and querying a priority reverse index
 *  via PrioPreindex and PrioReverseIndexReader. */
class FullPreindexTest {
    Path countsFile;
    Path wordsIdFile;
    Path docsFile;
    Path tempDir;
    Path positionsFile;

    TestJournalFactory journalFactory;

    @BeforeEach
    public void setUp() throws IOException {
        journalFactory = new TestJournalFactory();

        countsFile = Files.createTempFile("counts", ".dat");
        wordsIdFile = Files.createTempFile("words", ".dat");
        docsFile = Files.createTempFile("docs", ".dat");
        tempDir = Files.createTempDirectory("sort");
        positionsFile = tempDir.resolve("positions.dat");
    }

    @AfterEach
    public void tearDown() throws IOException {
        journalFactory.clear();

        Files.deleteIfExists(countsFile);
        Files.deleteIfExists(wordsIdFile);
        Files.deleteIfExists(positionsFile);
        Files.deleteIfExists(docsFile);

        // Files.list() must be closed, or the directory handle leaks;
        // the original code never closed the stream.
        List<Path> contents = new ArrayList<>();
        try (var files = Files.list(tempDir)) {
            files.forEach(contents::add);
        }
        for (var tempFile : contents) {
            Files.delete(tempFile);
        }
        Files.delete(tempDir);
    }

    @Test
    public void testFinalizeSimple() throws IOException {
        // Two documents (ids 100 and 104) that both contain term 50
        var journalReader = journalFactory.createReader(
                new EntryDataWithWordMeta(100, 101, wm(50, 51)),
                new EntryDataWithWordMeta(104, 101, wm(50, 52))
        );

        var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
        preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
        preindex.delete();

        Path wordsFile = tempDir.resolve("words.dat");
        Path docsFile = tempDir.resolve("docs.dat");

        assertTrue(Files.exists(wordsFile));
        assertTrue(Files.exists(docsFile));

        var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);

        var entrySource = indexReader.documents(50);
        var lqb = new LongQueryBuffer(32);
        entrySource.read(lqb);

        // Both documents should be present, in ascending id order
        assertEquals(2, lqb.size());

        // Copy the buffer once instead of once per assertion
        long[] data = lqb.copyData();
        assertEquals(100, data[0]);
        assertEquals(104, data[1]);
    }
}

View File

@ -38,19 +38,18 @@ public class IndexFactory {
return IndexLocations.getSearchSetsPath(fileStorageService); return IndexLocations.getSearchSetsPath(fileStorageService);
} }
public ReverseIndexReader getReverseIndexReader() throws IOException { public FullReverseIndexReader getReverseIndexReader() throws IOException {
return new ReverseIndexReader("full", return new FullReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT), ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT)) new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
); );
} }
public ReverseIndexReader getReverseIndexPrioReader() throws IOException { public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new ReverseIndexReader("prio", return new PrioReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT), ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
null
); );
} }

View File

@ -5,7 +5,8 @@ import it.unimi.dsi.fastutil.longs.LongList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet; import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.model.SearchTerms;
@ -38,30 +39,25 @@ public class CombinedIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForwardIndexReader forwardIndexReader; private final ForwardIndexReader forwardIndexReader;
private final ReverseIndexReader reverseIndexFullReader; private final FullReverseIndexReader reverseIndexFullReader;
private final ReverseIndexReader reverseIndexPriorityReader; private final PrioReverseIndexReader reverseIndexPriorityReader;
public CombinedIndexReader(ForwardIndexReader forwardIndexReader, public CombinedIndexReader(ForwardIndexReader forwardIndexReader,
ReverseIndexReader reverseIndexFullReader, FullReverseIndexReader reverseIndexFullReader,
ReverseIndexReader reverseIndexPriorityReader) { PrioReverseIndexReader reverseIndexPriorityReader) {
this.forwardIndexReader = forwardIndexReader; this.forwardIndexReader = forwardIndexReader;
this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPriorityReader = reverseIndexPriorityReader; this.reverseIndexPriorityReader = reverseIndexPriorityReader;
} }
public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) { public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
} }
public QueryFilterStepIf hasWordFull(long termId) { public QueryFilterStepIf hasWordFull(long termId) {
return reverseIndexFullReader.also(termId); return reverseIndexFullReader.also(termId);
} }
public QueryFilterStepIf hasWordPrio(long termId) {
return reverseIndexPriorityReader.also(termId);
}
/** Creates a query builder for terms in the priority index */ /** Creates a query builder for terms in the priority index */
public IndexQueryBuilder findPriorityWord(long wordId) { public IndexQueryBuilder findPriorityWord(long wordId) {
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId))) return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
@ -124,7 +120,7 @@ public class CombinedIndexReader {
if (paths.size() < 4) { if (paths.size() < 4) {
var prioHead = findPriorityWord(elements.getLong(0)); var prioHead = findPriorityWord(elements.getLong(0));
for (int i = 1; i < elements.size(); i++) { for (int i = 1; i < elements.size(); i++) {
prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i))); prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
} }
queryHeads.add(prioHead); queryHeads.add(prioHead);
} }

View File

@ -2,7 +2,8 @@ package nu.marginalia.index.index;
import java.util.List; import java.util.List;
import gnu.trove.set.hash.TLongHashSet; import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf; import nu.marginalia.index.query.filter.QueryFilterAnyOf;
@ -10,8 +11,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
public class IndexQueryBuilderImpl implements IndexQueryBuilder { public class IndexQueryBuilderImpl implements IndexQueryBuilder {
private final IndexQuery query; private final IndexQuery query;
private final ReverseIndexReader reverseIndexFullReader; private final FullReverseIndexReader reverseIndexFullReader;
private final ReverseIndexReader reverseIndexPrioReader;
/* Keep track of already added include terms to avoid redundant checks. /* Keep track of already added include terms to avoid redundant checks.
* *
@ -21,13 +21,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
* */ * */
private final TLongHashSet alreadyConsideredTerms = new TLongHashSet(); private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader, IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
ReverseIndexReader reverseIndexPrioReader,
IndexQuery query)
{ {
this.query = query; this.query = query;
this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexFullReader = reverseIndexFullReader;
this.reverseIndexPrioReader = reverseIndexPrioReader;
} }
public IndexQueryBuilder withSourceTerms(long... sourceTerms) { public IndexQueryBuilder withSourceTerms(long... sourceTerms) {

View File

@ -18,6 +18,7 @@ import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames; import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
@ -269,7 +270,7 @@ public class IntegrationTest {
// important to the document. This filter will act on the encoded {@see WordMetadata} // important to the document. This filter will act on the encoded {@see WordMetadata}
LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter(); LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
var constructor = new FullIndexConstructor( var constructor = new PrioIndexConstructor(
outputFileDocs, outputFileDocs,
outputFileWords, outputFileWords,
outputFilePositions, outputFilePositions,