Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
(index-reverse) Simplify priority index

* Do not emit a documents file
* Do not interlace metadata or offsets with doc ids
parent: 85c99ae808
commit: fa36689597
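
The substance of the change is visible in the PrioPreindexDocuments hunk further down: each priority-index posting shrinks from two interleaved longs (the rank-encoded doc id plus an offset into a positions file) to a single long (the doc id alone), and no positions data is written during priority-index construction anymore. Excerpted from that hunk:

    // Before: two interleaved longs per posting
    //   [ rankEncodedId | encodedPosOffset ][ rankEncodedId | encodedPosOffset ] ...
    assembly.put(offset + 0, rankEncodedId);
    assembly.put(offset + 1, encodedPosOffset);

    // After: one long per posting, no positions-file write
    //   [ rankEncodedId ][ rankEncodedId ] ...
    assembly.put(offset, rankEncodedId);

Everything else follows mechanically from this: record strides, iterator strides, and size estimates drop from 2 to 1 throughout the prio construction and merge path.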
ReverseIndexReader.java → FullReverseIndexReader.java (renamed)

@@ -21,7 +21,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.concurrent.Executors;
 
-public class ReverseIndexReader {
+public class FullReverseIndexReader {
     private final LongArray words;
     private final LongArray documents;
     private final long wordsDataOffset;
@@ -31,7 +31,7 @@ public class ReverseIndexReader {
 
     private final PositionsFileReader positionsFileReader;
 
-    public ReverseIndexReader(String name,
+    public FullReverseIndexReader(String name,
                               Path words,
                               Path documents,
                               PositionsFileReader positionsFileReader) throws IOException {
@@ -138,7 +138,7 @@ public class ReverseIndexReader {
     private BTreeReader createReaderNew(long offset) {
         return new BTreeReader(
                 documents,
-                ReverseIndexParameters.docsBTreeContext,
+                ReverseIndexParameters.fullDocsBTreeContext,
                 offset);
     }
 
PrioReverseIndexReader.java (new file)

@@ -0,0 +1,99 @@
+package nu.marginalia.index;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.btree.BTreeReader;
+import nu.marginalia.index.query.EmptyEntrySource;
+import nu.marginalia.index.query.EntrySource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+public class PrioReverseIndexReader {
+    private final LongArray words;
+    private final LongArray documents;
+    private final long wordsDataOffset;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final BTreeReader wordsBTreeReader;
+    private final String name;
+
+    public PrioReverseIndexReader(String name,
+                                  Path words,
+                                  Path documents) throws IOException {
+        this.name = name;
+
+        if (!Files.exists(words) || !Files.exists(documents)) {
+            this.words = null;
+            this.documents = null;
+            this.wordsBTreeReader = null;
+            this.wordsDataOffset = -1;
+            return;
+        }
+
+        logger.info("Switching reverse index");
+
+        this.words = LongArrayFactory.mmapForReadingShared(words);
+        this.documents = LongArrayFactory.mmapForReadingShared(documents);
+
+        wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
+        wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
+
+    }
+
+    /** Calculate the offset of the word in the documents.
+     * If the return-value is negative, the term does not exist
+     * in the index.
+     */
+    long wordOffset(long termId) {
+        long idx = wordsBTreeReader.findEntry(termId);
+
+        if (idx < 0)
+            return -1L;
+
+        return words.get(wordsDataOffset + idx + 1);
+    }
+
+    public EntrySource documents(long termId) {
+        if (null == words) {
+            logger.warn("Reverse index is not ready, dropping query");
+            return new EmptyEntrySource();
+        }
+
+        long offset = wordOffset(termId);
+
+        if (offset < 0) // No documents
+            return new EmptyEntrySource();
+
+        return new ReverseIndexEntrySource(name, createReaderNew(offset), 1, termId);
+    }
+
+    /** Return the number of documents with the termId in the index */
+    public int numDocuments(long termId) {
+        long offset = wordOffset(termId);
+
+        if (offset < 0)
+            return 0;
+
+        return createReaderNew(offset).numEntries();
+    }
+
+    /** Create a BTreeReader for the document offset associated with a termId */
+    private BTreeReader createReaderNew(long offset) {
+        return new BTreeReader(
+                documents,
+                ReverseIndexParameters.prioDocsBTreeContext,
+                offset);
+    }
+
+    public void close() {
+        if (documents != null)
+            documents.close();
+
+        if (words != null)
+            words.close();
+    }
+
+}
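For orientation, here is a minimal usage sketch of the new reader, mirroring the read path exercised by the test added at the end of this commit; the method, paths, and termId are placeholders invented for illustration:

    import nu.marginalia.array.page.LongQueryBuffer;
    import nu.marginalia.index.PrioReverseIndexReader;

    import java.io.IOException;
    import java.nio.file.Path;

    class PrioReaderUsageSketch {
        // hypothetical caller; wordsPath, docsPath and termId are stand-ins
        static long[] docIdsFor(Path wordsPath, Path docsPath, long termId) throws IOException {
            var reader = new PrioReverseIndexReader("prio", wordsPath, docsPath);
            var source = reader.documents(termId);  // EntrySource over the term's doc ids
            var buffer = new LongQueryBuffer(32);
            source.read(buffer);                    // fills the buffer with matching doc ids
            long[] docIds = buffer.copyData();
            reader.close();
            return docIds;
        }
    }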
ReverseIndexParameters.java

@@ -5,6 +5,7 @@ import nu.marginalia.btree.model.BTreeContext;
 
 public class ReverseIndexParameters
 {
-    public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
+    public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
+    public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
     public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
 }
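Judging by how the two new contexts are used elsewhere in this commit, the second BTreeContext argument is the record size in longs; that reading is an inference from usage, not something stated on this page. Annotated under that assumption:

    // inferred: BTreeContext(maxLayers, entrySizeLongs, blockSize)
    // prio postings are now a bare doc id: 1 long per entry
    public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048);
    // full postings keep doc id + positions offset: 2 longs per entry
    public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);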
ReverseIndexSelfTest.java

@@ -22,7 +22,7 @@ public class ReverseIndexSelfTest {
     public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) {
         logger.info("Starting test 2");
         for (long i = 1; i < wordsDataRange.size(); i+=2) {
-            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i));
+            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
             var header = docsBTreeReader.getHeader();
             var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
 
@@ -49,7 +49,7 @@ public class ReverseIndexSelfTest {
     public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) {
         logger.info("Starting test 4");
         for (long i = 1; i < wordsDataRange.size(); i+=2) {
-            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i));
+            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
             var header = docsBTreeReader.getHeader();
             var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
             for (int j = 0; j < docRange.size(); j+=2) {
@@ -84,7 +84,7 @@ public class ReverseIndexSelfTest {
     public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) {
         logger.info("Starting test 6");
         for (long i = 1; i < wordsDataRange.size(); i+=2) {
-            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i));
+            var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i));
             var header = docsBTreeReader.getHeader();
             var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L);
             Long prev = null;
FullPreindex.java

@@ -82,7 +82,7 @@ public class FullPreindex {
 
         // Estimate the size of the docs index data
         offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
-        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2);
         offsets.fold(0, 0, offsets.size(), sizeEstimator);
 
         // Write the docs file
@@ -90,7 +90,7 @@ public class FullPreindex {
         try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
             offsets.transformEachIO(0, offsets.size(),
                     new FullIndexBTreeTransformer(finalDocs, 2,
-                            ReverseIndexParameters.docsBTreeContext,
+                            ReverseIndexParameters.fullDocsBTreeContext,
                             intermediateDocChannel));
             intermediateDocChannel.force(false);
         }
FullPreindexDocuments.java

@@ -20,7 +20,7 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 
 /** A LongArray with document data, segmented according to
- * the associated ReversePreindexWordSegments data
+ * the associated FullPreindexWordSegments data
  */
 public class FullPreindexDocuments {
     public final LongArray documents;
PrioIndexConstructor.java

@@ -55,8 +55,7 @@ public class PrioIndexConstructor {
         }
 
         try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName);
-             var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes");
-             var posConstructor = new PositionsFileConstructor(outputFilePositions)
+             var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")
        ) {
             heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT);
 
@@ -66,7 +65,7 @@ public class PrioIndexConstructor {
                 .parallelStream()
                 .map(in -> {
                     preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size());
-                    return construct(in, posConstructor);
+                    return construct(in);
                 })
                 .reduce(this::merge)
                 .ifPresent((index) -> {
@@ -80,9 +79,9 @@ public class PrioIndexConstructor {
     }
 
     @SneakyThrows
-    private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) {
+    private PrioPreindexReference construct(Path input) {
         return PrioPreindex
-                .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir)
+                .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir)
                 .closeToReference();
     }
 
PrioPreindex.java

@@ -7,7 +7,6 @@ import nu.marginalia.index.ReverseIndexParameters;
 import nu.marginalia.index.construction.CountToOffsetTransformer;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.IndexSizeEstimator;
-import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -44,7 +43,6 @@ public class PrioPreindex {
      * will have randomly assigned names.
      */
     public static PrioPreindex constructPreindex(IndexJournalReader reader,
-                                                 PositionsFileConstructor positionsFileConstructor,
                                                  DocIdRewriter docIdRewriter,
                                                  Path workDir) throws IOException
     {
@@ -53,7 +51,7 @@ public class PrioPreindex {
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
 
         var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
-        var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments);
+        var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
         return new PrioPreindex(segments, docs);
     }
 
@@ -81,16 +79,16 @@ public class PrioPreindex {
         Files.deleteIfExists(outputFileWords);
 
         // Estimate the size of the docs index data
-        offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
-        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+        offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(1));
+        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.prioDocsBTreeContext, 1);
         offsets.fold(0, 0, offsets.size(), sizeEstimator);
 
         // Write the docs file
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
         try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
             offsets.transformEachIO(0, offsets.size(),
-                    new PrioIndexBTreeTransformer(finalDocs, 2,
-                            ReverseIndexParameters.docsBTreeContext,
+                    new PrioIndexBTreeTransformer(finalDocs, 1,
+                            ReverseIndexParameters.prioDocsBTreeContext,
                             intermediateDocChannel));
             intermediateDocChannel.force(false);
         }
@@ -137,9 +135,9 @@ public class PrioPreindex {
         PrioPreindexWordSegments mergingSegment =
                 createMergedSegmentWordFile(destDir, left.segments, right.segments);
 
-        var mergingIter = mergingSegment.constructionIterator(2);
-        var leftIter = left.segments.iterator(2);
-        var rightIter = right.segments.iterator(2);
+        var mergingIter = mergingSegment.constructionIterator(1);
+        var leftIter = left.segments.iterator(1);
+        var rightIter = right.segments.iterator(1);
 
         Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
 
@@ -200,7 +198,7 @@ public class PrioPreindex {
         // duplicates in the data, so we need to shrink it to the actual size we wrote.
 
         mergedDocuments = shrinkMergedDocuments(mergedDocuments,
-                docsFile, 2 * mergingSegment.totalSize());
+                docsFile, mergingSegment.totalSize());
 
         return new PrioPreindex(
                 mergingSegment,
@@ -274,8 +272,7 @@ public class PrioPreindex {
                 leftIter.startOffset, leftIter.endOffset,
                 rightIter.startOffset, rightIter.endOffset);
 
-        long distinct = segSize / 2;
-        destIter.putNext(distinct);
+        destIter.putNext(segSize);
         leftIter.next();
         rightIter.next();
     }
@@ -297,7 +294,7 @@ public class PrioPreindex {
                 mergingIter.startOffset,
                 end);
 
-        boolean putNext = mergingIter.putNext(size / 2);
+        boolean putNext = mergingIter.putNext(size);
         boolean iterNext = sourceIter.next();
 
         if (!putNext && iterNext)
PrioPreindexDocuments.java

@@ -4,7 +4,6 @@ import lombok.SneakyThrows;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 import nu.marginalia.index.construction.DocIdRewriter;
-import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.rwf.RandomFileAssembler;
 import org.slf4j.Logger;
@@ -20,13 +19,12 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 
 /** A LongArray with document data, segmented according to
- * the associated ReversePreindexWordSegments data
+ * the associated FullPreindexWordSegments data
  */
 public class PrioPreindexDocuments {
     public final LongArray documents;
 
-    private static PositionsFileConstructor positionsFileConstructor;
-    private static final int RECORD_SIZE_LONGS = 2;
+    private static final int RECORD_SIZE_LONGS = 1;
     private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
 
     public final Path file;
@@ -41,9 +39,7 @@ public class PrioPreindexDocuments {
             Path workDir,
             IndexJournalReader reader,
             DocIdRewriter docIdRewriter,
-            PositionsFileConstructor positionsFileConstructor,
             PrioPreindexWordSegments segments) throws IOException {
-        PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor;
 
         createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter);
 
@@ -88,11 +84,7 @@ public class PrioPreindexDocuments {
 
                 long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
 
-                // write position data to the positions file and get the offset
-                long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer());
-
-                assembly.put(offset + 0, rankEncodedId);
-                assembly.put(offset + 1, encodedPosOffset);
+                assembly.put(offset, rankEncodedId);
             }
         }
 
@@ -112,11 +104,10 @@ public class PrioPreindexDocuments {
             long iterEnd = iter.endOffset;
 
             if (iter.size() < 1024) {
-                docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+                docsFileMap.sort(iterStart, iterEnd);
             }
             else {
-                sortingWorkers.execute(() ->
-                        docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd));
+                sortingWorkers.execute(() -> docsFileMap.sort(iterStart, iterEnd));
             }
         }
 
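The sorting change above follows directly from the narrower record: two-long records had to be sorted pairwise so each doc id stayed glued to its positions offset (hence the stride-aware quickSortN with RECORD_SIZE_LONGS), while single-long records admit a plain range sort. A toy illustration with ordinary arrays; the class and values are made up:

    import java.util.Arrays;

    class StrideSortSketch {
        public static void main(String[] args) {
            // Two-long (id, offset) records flattened into one array:
            // an element-wise sort would tear the pairs apart,
            // which is why the old code needed quickSortN(2, ...).
            long[] paired = { 30, 107, 10, 105, 20, 106 };

            // After this commit a prio record is a single long,
            // so an ordinary sort of the range is already correct.
            long[] single = { 30, 10, 20 };
            Arrays.sort(single);
            System.out.println(Arrays.toString(single)); // [10, 20, 30]
        }
    }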
ReverseIndexReaderTest.java → FullReverseIndexReaderTest.java (renamed)

@@ -22,7 +22,7 @@ import java.util.List;
 import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
 import static org.junit.jupiter.api.Assertions.*;
 
-class ReverseIndexReaderTest {
+class FullReverseIndexReaderTest {
     TestJournalFactory journalFactory;
     Path tempDir;
 
@@ -82,7 +82,7 @@ class ReverseIndexReaderTest {
 
     }
 
-    private long[] readEntries(ReverseIndexReader reader, long wordId) {
+    private long[] readEntries(FullReverseIndexReader reader, long wordId) {
         var es = reader.documents(wordId);
         assertTrue(es.hasMore());
         LongQueryBuffer buffer = new LongQueryBuffer(4);
@@ -91,7 +91,7 @@ class ReverseIndexReaderTest {
         return buffer.copyData();
     }
 
-    private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
+    private FullReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
         var reader = journalFactory.createReader(scenario);
 
         Path posFile = tempDir.resolve("positions.dat");
@@ -106,7 +106,7 @@ class ReverseIndexReaderTest {
             preindex.delete();
         }
 
-        return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
+        return new FullReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
 
     }
 }
ReverseIndexDebugTest.java

@@ -26,7 +26,7 @@ public class ReverseIndexDebugTest {
         long wordOffset = wordsBTreeReader.findEntry(problemWord);
         assertTrue(wordOffset >= 0);
 
-        var docsReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordOffset);
+        var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset);
 
         // We find problemDoc even though it doesn't exist in the document range
         long docOffset = docsReader.findEntry(problemDoc);
TestJournalFactory.java

@@ -58,7 +58,7 @@ public class TestJournalFactory {
         return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
     }
 
-    IndexJournalReader createReader(EntryData... entries) throws IOException {
+    public IndexJournalReader createReader(EntryData... entries) throws IOException {
         Path jf = Files.createTempFile(tempDir, "journal", ".dat");
 
         var writer = new IndexJournalWriterSingleFileImpl(jf);
New test file in package nu.marginalia.index.construction.prio (class FullPreindexTest)

@@ -0,0 +1,86 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.index.PrioReverseIndexReader;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.index.construction.full.TestJournalFactory.*;
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class FullPreindexTest {
+    Path countsFile;
+    Path wordsIdFile;
+    Path docsFile;
+    Path tempDir;
+    Path positionsFile;
+
+    TestJournalFactory journalFactory;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        countsFile = Files.createTempFile("counts", ".dat");
+        wordsIdFile = Files.createTempFile("words", ".dat");
+        docsFile = Files.createTempFile("docs", ".dat");
+        tempDir = Files.createTempDirectory("sort");
+        positionsFile = tempDir.resolve("positions.dat");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        Files.deleteIfExists(countsFile);
+        Files.deleteIfExists(wordsIdFile);
+        Files.deleteIfExists(positionsFile);
+        Files.deleteIfExists(docsFile);
+
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var tempFile : contents) {
+            Files.delete(tempFile);
+        }
+        Files.delete(tempDir);
+    }
+
+    @Test
+    public void testFinalizeSimple() throws IOException {
+        var journalReader = journalFactory.createReader(
+                new EntryDataWithWordMeta(100, 101, wm(50, 51)),
+                new EntryDataWithWordMeta(104, 101, wm(50, 52))
+        );
+
+        var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir);
+        preindex.finalizeIndex(tempDir.resolve("docs.dat"), tempDir.resolve("words.dat"));
+        preindex.delete();
+
+        Path wordsFile = tempDir.resolve("words.dat");
+        Path docsFile = tempDir.resolve("docs.dat");
+
+        assertTrue(Files.exists(wordsFile));
+        assertTrue(Files.exists(docsFile));
+
+        var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile);
+
+        var entrySource = indexReader.documents(50);
+        var lqb = new LongQueryBuffer(32);
+        entrySource.read(lqb);
+
+        assertEquals(2, lqb.size());
+        assertEquals(100, lqb.copyData()[0]);
+        assertEquals(104, lqb.copyData()[1]);
+    }
+}
IndexFactory.java

@@ -38,19 +38,18 @@ public class IndexFactory {
         return IndexLocations.getSearchSetsPath(fileStorageService);
     }
 
-    public ReverseIndexReader getReverseIndexReader() throws IOException {
-        return new ReverseIndexReader("full",
+    public FullReverseIndexReader getReverseIndexReader() throws IOException {
+        return new FullReverseIndexReader("full",
                 ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
                 ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
                 new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
         );
     }
 
-    public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
-        return new ReverseIndexReader("prio",
+    public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException {
+        return new PrioReverseIndexReader("prio",
                 ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
-                ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
-                null
+                ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
         );
     }
 
CombinedIndexReader.java

@@ -5,7 +5,8 @@ import it.unimi.dsi.fastutil.longs.LongList;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import it.unimi.dsi.fastutil.longs.LongSet;
 import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.index.ReverseIndexReader;
+import nu.marginalia.index.FullReverseIndexReader;
+import nu.marginalia.index.PrioReverseIndexReader;
 import nu.marginalia.index.forward.ForwardIndexReader;
 import nu.marginalia.index.model.QueryParams;
 import nu.marginalia.index.model.SearchTerms;
@@ -38,30 +39,25 @@ public class CombinedIndexReader {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final ForwardIndexReader forwardIndexReader;
-    private final ReverseIndexReader reverseIndexFullReader;
-    private final ReverseIndexReader reverseIndexPriorityReader;
+    private final FullReverseIndexReader reverseIndexFullReader;
+    private final PrioReverseIndexReader reverseIndexPriorityReader;
 
     public CombinedIndexReader(ForwardIndexReader forwardIndexReader,
-                               ReverseIndexReader reverseIndexFullReader,
-                               ReverseIndexReader reverseIndexPriorityReader) {
+                               FullReverseIndexReader reverseIndexFullReader,
+                               PrioReverseIndexReader reverseIndexPriorityReader) {
         this.forwardIndexReader = forwardIndexReader;
         this.reverseIndexFullReader = reverseIndexFullReader;
        this.reverseIndexPriorityReader = reverseIndexPriorityReader;
     }
 
     public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) {
-        return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query);
+        return new IndexQueryBuilderImpl(reverseIndexFullReader, query);
     }
 
     public QueryFilterStepIf hasWordFull(long termId) {
         return reverseIndexFullReader.also(termId);
     }
 
-    public QueryFilterStepIf hasWordPrio(long termId) {
-        return reverseIndexPriorityReader.also(termId);
-    }
-
-
     /** Creates a query builder for terms in the priority index */
     public IndexQueryBuilder findPriorityWord(long wordId) {
         return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
@@ -124,7 +120,7 @@ public class CombinedIndexReader {
         if (paths.size() < 4) {
             var prioHead = findPriorityWord(elements.getLong(0));
             for (int i = 1; i < elements.size(); i++) {
-                prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i)));
+                prioHead.addInclusionFilter(hasWordFull(elements.getLong(i)));
             }
             queryHeads.add(prioHead);
         }
IndexQueryBuilderImpl.java

@@ -2,7 +2,8 @@ package nu.marginalia.index.index;
 
 import java.util.List;
 import gnu.trove.set.hash.TLongHashSet;
-import nu.marginalia.index.ReverseIndexReader;
+import nu.marginalia.index.FullReverseIndexReader;
+import nu.marginalia.index.PrioReverseIndexReader;
 import nu.marginalia.index.query.IndexQuery;
 import nu.marginalia.index.query.IndexQueryBuilder;
 import nu.marginalia.index.query.filter.QueryFilterAnyOf;
@@ -10,8 +11,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
 
 public class IndexQueryBuilderImpl implements IndexQueryBuilder {
     private final IndexQuery query;
-    private final ReverseIndexReader reverseIndexFullReader;
-    private final ReverseIndexReader reverseIndexPrioReader;
+    private final FullReverseIndexReader reverseIndexFullReader;
 
     /* Keep track of already added include terms to avoid redundant checks.
      *
@@ -21,13 +21,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
      * */
     private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
 
-    IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader,
-                          ReverseIndexReader reverseIndexPrioReader,
-                          IndexQuery query)
+    IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
     {
         this.query = query;
         this.reverseIndexFullReader = reverseIndexFullReader;
-        this.reverseIndexPrioReader = reverseIndexPrioReader;
     }
 
     public IndexQueryBuilder withSourceTerms(long... sourceTerms) {
IntegrationTest.java

@@ -18,6 +18,7 @@ import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
 import nu.marginalia.index.ReverseIndexPrioFileNames;
 import nu.marginalia.index.construction.full.FullIndexConstructor;
+import nu.marginalia.index.construction.prio.PrioIndexConstructor;
 import nu.marginalia.index.domainrankings.DomainRankings;
 import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
@@ -269,7 +270,7 @@ public class IntegrationTest {
         // important to the document. This filter will act on the encoded {@see WordMetadata}
         LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();
 
-        var constructor = new FullIndexConstructor(
+        var constructor = new PrioIndexConstructor(
                 outputFileDocs,
                 outputFileWords,
                 outputFilePositions,