mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Hook in missing DocIdRewriter
This enables documents to be ranked properly.
This commit is contained in:
parent
ffa0366deb
commit
b6a92506d1
@ -17,6 +17,7 @@ public class ReverseIndexConstructor {
|
||||
public static void createReverseIndex(
|
||||
JournalReaderSource readerSource,
|
||||
Path sourceBaseDir,
|
||||
DocIdRewriter docIdRewriter,
|
||||
Path tmpDir,
|
||||
Path outputFileDocs,
|
||||
Path outputFileWords) throws IOException
|
||||
@ -31,8 +32,7 @@ public class ReverseIndexConstructor {
|
||||
|
||||
for (var input : inputs) {
|
||||
logger.info("Construcing preindex from {}", input);
|
||||
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input),
|
||||
tmpDir, tmpDir);
|
||||
var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir);
|
||||
preindexes.add(preindex);
|
||||
}
|
||||
|
||||
|
@ -73,6 +73,7 @@ public class ReversePreindex {
|
||||
documents.delete();
|
||||
}
|
||||
public static ReversePreindex constructPreindex(IndexJournalReader reader,
|
||||
DocIdRewriter docIdRewriter,
|
||||
Path tempDir,
|
||||
Path destDir) throws IOException
|
||||
{
|
||||
@ -84,7 +85,7 @@ public class ReversePreindex {
|
||||
logger.info("Segmenting");
|
||||
var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
|
||||
logger.info("Mapping docs");
|
||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), ctx, segments);
|
||||
var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments);
|
||||
logger.info("Done");
|
||||
return new ReversePreindex(segments, docs);
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** A LongArray with document data, segmented according to
|
||||
* the associated ReversePReindexWordSegments data
|
||||
* the associated ReversePreindexWordSegments data
|
||||
*/
|
||||
public class ReversePreindexDocuments {
|
||||
private final Path file;
|
||||
@ -38,7 +38,7 @@ public class ReversePreindexDocuments {
|
||||
ReversePreindexWordSegments segments) throws IOException {
|
||||
|
||||
|
||||
logger.info("Transfering data");
|
||||
logger.info("Transferring data");
|
||||
createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter);
|
||||
|
||||
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
|
||||
@ -71,16 +71,20 @@ public class ReversePreindexDocuments {
|
||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||
offsetMap.defaultReturnValue(0);
|
||||
|
||||
reader.forEachDocIdRecord((docId, rec) -> {
|
||||
long wordId = rec.wordId();
|
||||
long meta = rec.metadata();
|
||||
for (var entry : reader) {
|
||||
long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId());
|
||||
|
||||
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
|
||||
var data = entry.readEntry();
|
||||
for (int i = 0; i + 1 < data.size(); i+=2) {
|
||||
long wordId = data.get(i);
|
||||
long meta = data.get(i+1);
|
||||
|
||||
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
|
||||
|
||||
outArray.set(offset + 0, rankEncodedId);
|
||||
outArray.set(offset + 1, meta);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
outArray.force();
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.index;
|
||||
|
||||
import nu.marginalia.array.algo.SortingContext;
|
||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReversePreindex;
|
||||
import nu.marginalia.index.construction.TestJournalFactory;
|
||||
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
|
||||
@ -93,7 +94,7 @@ class ReverseIndexReaderTest {
|
||||
|
||||
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
|
||||
var reader = journalFactory.createReader(scenario);
|
||||
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
|
||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
||||
|
||||
|
||||
Path docsFile = tempDir.resolve("docs.dat");
|
||||
|
@ -54,7 +54,7 @@ class ReversePreindexFinalizeTest {
|
||||
@Test
|
||||
public void testFinalizeSimple() throws IOException {
|
||||
var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
|
||||
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
|
||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
||||
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
@ -92,7 +92,7 @@ class ReversePreindexFinalizeTest {
|
||||
new EntryDataWithWordMeta(101, 101, wm(51, 52))
|
||||
);
|
||||
|
||||
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
|
||||
var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir);
|
||||
|
||||
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat"));
|
||||
preindex.delete();
|
||||
|
@ -54,8 +54,8 @@ class ReversePreindexMergeTest {
|
||||
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
|
||||
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
|
||||
|
||||
var left = ReversePreindex.constructPreindex(reader1, tempDir, tempDir);
|
||||
var right = ReversePreindex.constructPreindex(reader2, tempDir, tempDir);
|
||||
var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir, tempDir);
|
||||
var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir, tempDir);
|
||||
return ReversePreindex.merge(tempDir, left, right);
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,7 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.db.storage.FileStorageService;
|
||||
import nu.marginalia.db.storage.model.FileStorage;
|
||||
import nu.marginalia.db.storage.model.FileStorageType;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||
@ -13,6 +14,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReadEntry;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
@ -105,6 +107,7 @@ public class IndexConstructorMain {
|
||||
ReverseIndexConstructor.
|
||||
createReverseIndex(IndexJournalReader::singleFile,
|
||||
indexStaging.asPath(),
|
||||
this::addRank,
|
||||
tmpDir,
|
||||
outputFileDocs,
|
||||
outputFileWords);
|
||||
@ -123,7 +126,7 @@ public class IndexConstructorMain {
|
||||
|
||||
ReverseIndexConstructor.
|
||||
createReverseIndex(IndexJournalReader::singleFileWithPriorityFilters,
|
||||
indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
|
||||
indexStaging.asPath(), this::addRank, tmpDir, outputFileDocs, outputFileWords);
|
||||
}
|
||||
|
||||
private void createForwardIndex() throws SQLException, IOException {
|
||||
@ -144,7 +147,13 @@ public class IndexConstructorMain {
|
||||
converter.convert();
|
||||
}
|
||||
|
||||
/** Rewrites a document id by combining it with its sort ranking.
 * Looks up the ranking for docId via domainRankings.getSortRanking and
 * returns whatever UrlIdCodec.addRank produces for that rank and docId.
 * Passed as this::addRank in the DocIdRewriter argument position of
 * ReverseIndexConstructor.createReverseIndex.
 * NOTE(review): how the rank is encoded into the id is defined by
 * UrlIdCodec, which is not visible here.
 *
 * @param docId the document id to rewrite
 * @return the value returned by UrlIdCodec.addRank(rank, docId)
 */
private long addRank(long docId) {
|
||||
float rank = domainRankings.getSortRanking(docId);
|
||||
return UrlIdCodec.addRank(rank, docId);
|
||||
}
|
||||
|
||||
private class CreateIndexInstructions {
|
||||
|
||||
public final IndexName name;
|
||||
private final MqSingleShotInbox inbox;
|
||||
private final MqMessage message;
|
||||
|
@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.client.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||
@ -148,7 +149,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
|
||||
|
||||
ReverseIndexConstructor.
|
||||
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
|
||||
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
|
||||
}
|
||||
|
||||
private void createPrioReverseIndex() throws SQLException, IOException {
|
||||
@ -163,7 +164,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
||||
|
||||
ReverseIndexConstructor.
|
||||
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
|
||||
createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
|
||||
}
|
||||
|
||||
private void createForwardIndex() throws SQLException, IOException {
|
||||
|
Loading…
Reference in New Issue
Block a user