(index) Hook in missing DocIdRewriter

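This enables documents to be ranked properly: ReversePreindex.constructPreindex previously always passed DocIdRewriter.identity() to ReversePreindexDocuments.construct, so the index constructor's rank-encoding rewriter (addRank) was never applied to document IDs.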
This commit is contained in:
Viktor Lofgren 2023-08-28 19:53:43 +02:00
parent ffa0366deb
commit b6a92506d1
8 changed files with 37 additions and 21 deletions

View File

@ -17,6 +17,7 @@ public class ReverseIndexConstructor {
public static void createReverseIndex(
JournalReaderSource readerSource,
Path sourceBaseDir,
DocIdRewriter docIdRewriter,
Path tmpDir,
Path outputFileDocs,
Path outputFileWords) throws IOException
@ -31,8 +32,7 @@ public class ReverseIndexConstructor {
for (var input : inputs) {
logger.info("Construcing preindex from {}", input);
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input),
tmpDir, tmpDir);
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir);
preindexes.add(preindex);
}

View File

@ -73,6 +73,7 @@ public class ReversePreindex {
documents.delete();
}
public static ReversePreindex constructPreindex(IndexJournalReader reader,
DocIdRewriter docIdRewriter,
Path tempDir,
Path destDir) throws IOException
{
@ -84,7 +85,7 @@ public class ReversePreindex {
logger.info("Segmenting");
var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
logger.info("Mapping docs");
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), ctx, segments);
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments);
logger.info("Done");
return new ReversePreindex(segments, docs);
}

View File

@ -17,7 +17,7 @@ import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/** A LongArray with document data, segmented according to
* the associated ReversePReindexWordSegments data
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
private final Path file;
@ -38,7 +38,7 @@ public class ReversePreindexDocuments {
ReversePreindexWordSegments segments) throws IOException {
logger.info("Transfering data");
logger.info("Transferring data");
createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter);
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
@ -71,16 +71,20 @@ public class ReversePreindexDocuments {
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
reader.forEachDocIdRecord((docId, rec) -> {
long wordId = rec.wordId();
long meta = rec.metadata();
for (var entry : reader) {
long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId());
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
var data = entry.readEntry();
for (int i = 0; i + 1 < data.size(); i+=2) {
long wordId = data.get(i);
long meta = data.get(i+1);
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
outArray.set(offset + 0, rankEncodedId);
outArray.set(offset + 1, meta);
});
}
}
outArray.force();
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.index;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
@ -93,7 +94,7 @@ class ReverseIndexReaderTest {
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
Path docsFile = tempDir.resolve("docs.dat");

View File

@ -54,7 +54,7 @@ class ReversePreindexFinalizeTest {
@Test
public void testFinalizeSimple() throws IOException {
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
@ -92,7 +92,7 @@ class ReversePreindexFinalizeTest {
new EntryDataWithWordMeta(101, 101, wm(51, 52))
);
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
preindex.delete();

View File

@ -54,8 +54,8 @@ class ReversePreindexMergeTest {
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
var left = ReversePreindex.constructPreindex(reader1, tempDir, tempDir);
var right = ReversePreindex.constructPreindex(reader2, tempDir, tempDir);
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir, tempDir);
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir, tempDir);
return ReversePreindex.merge(tempDir, left, right);
}

View File

@ -6,6 +6,7 @@ import com.google.inject.Inject;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.db.storage.model.FileStorage;
import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@ -13,6 +14,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.reader.IndexJournalReadEntry;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse;
@ -105,6 +107,7 @@ public class IndexConstructorMain {
ReverseIndexConstructor.
createReverseIndex(IndexJournalReader::singleFile,
indexStaging.asPath(),
this::addRank,
tmpDir,
outputFileDocs,
outputFileWords);
@ -123,7 +126,7 @@ public class IndexConstructorMain {
ReverseIndexConstructor.
createReverseIndex(IndexJournalReader::singleFileWithPriorityFilters,
indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
indexStaging.asPath(), this::addRank, tmpDir, outputFileDocs, outputFileWords);
}
private void createForwardIndex() throws SQLException, IOException {
@ -144,7 +147,13 @@ public class IndexConstructorMain {
converter.convert();
}
/**
 * Rewrites a document id so that it carries its sort ranking.
 *
 * <p>Looks up the sort ranking for {@code docId} in {@code domainRankings} and
 * hands both values to {@code UrlIdCodec.addRank}. This class passes the method
 * as {@code this::addRank} to {@code ReverseIndexConstructor.createReverseIndex}.
 *
 * @param docId the document id to rewrite
 * @return the value produced by {@code UrlIdCodec.addRank} for this id and its ranking
 */
private long addRank(long docId) {
    // Kept as a float local so the conversion to float happens exactly where it did before.
    final float sortRanking = domainRankings.getSortRanking(docId);

    return UrlIdCodec.addRank(sortRanking, docId);
}
private class CreateIndexInstructions {
public final IndexName name;
private final MqSingleShotInbox inbox;
private final MqMessage message;

View File

@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
@ -148,7 +149,7 @@ public class IndexQueryServiceIntegrationTest {
ReverseIndexConstructor.
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
}
private void createPrioReverseIndex() throws SQLException, IOException {
@ -163,7 +164,7 @@ public class IndexQueryServiceIntegrationTest {
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
ReverseIndexConstructor.
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
}
private void createForwardIndex() throws SQLException, IOException {