From b6a92506d1ebbfdb8823802c4e86b161261c5932 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 28 Aug 2023 19:53:43 +0200 Subject: [PATCH] (index) Hook in missing DocIdRewriter This enables documents to be ranked properly. --- .../construction/ReverseIndexConstructor.java | 4 ++-- .../index/construction/ReversePreindex.java | 3 ++- .../ReversePreindexDocuments.java | 24 +++++++++++-------- .../index/ReverseIndexReaderTest.java | 3 ++- .../ReversePreindexFinalizeTest.java | 4 ++-- .../ReversePreindexMergeTest.java | 4 ++-- .../index/IndexConstructorMain.java | 11 ++++++++- .../svc/IndexQueryServiceIntegrationTest.java | 5 ++-- 8 files changed, 37 insertions(+), 21 deletions(-) diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java index 259b5c16..91e6e60b 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexConstructor.java @@ -17,6 +17,7 @@ public class ReverseIndexConstructor { public static void createReverseIndex( JournalReaderSource readerSource, Path sourceBaseDir, + DocIdRewriter docIdRewriter, Path tmpDir, Path outputFileDocs, Path outputFileWords) throws IOException @@ -31,8 +32,7 @@ public class ReverseIndexConstructor { for (var input : inputs) { logger.info("Construcing preindex from {}", input); - var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), - tmpDir, tmpDir); + var preindex = ReversePreindex.constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir, tmpDir); preindexes.add(preindex); } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java index e5e1a5c2..19d3ad99 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java @@ -73,6 +73,7 @@ public class ReversePreindex { documents.delete(); } public static ReversePreindex constructPreindex(IndexJournalReader reader, + DocIdRewriter docIdRewriter, Path tempDir, Path destDir) throws IOException { @@ -84,7 +85,7 @@ public class ReversePreindex { logger.info("Segmenting"); var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile); logger.info("Mapping docs"); - var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), ctx, segments); + var docs = ReversePreindexDocuments.construct(docsFile, reader, docIdRewriter, ctx, segments); logger.info("Done"); return new ReversePreindex(segments, docs); } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java index b1faa6dd..4f5d0c61 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindexDocuments.java @@ -17,7 +17,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** A LongArray with document data, segmented according to - * the associated ReversePReindexWordSegments data + * the associated ReversePreindexWordSegments data */ public class ReversePreindexDocuments { private final Path file; @@ -38,7 +38,7 @@ public class ReversePreindexDocuments { ReversePreindexWordSegments segments) throws IOException { - logger.info("Transfering data"); + logger.info("Transferring data"); createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter); LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile)); @@ -71,16 +71,20 @@ public class ReversePreindexDocuments { var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); - reader.forEachDocIdRecord((docId, rec) -> { - long wordId = rec.wordId(); - long meta = rec.metadata(); + for (var entry : reader) { + long rankEncodedId = docIdRewriter.rewriteDocId(entry.docId()); - long rankEncodedId = docIdRewriter.rewriteDocId(docId); + var data = entry.readEntry(); + for (int i = 0; i + 1 < data.size(); i+=2) { + long wordId = data.get(i); + long meta = data.get(i+1); - long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); - outArray.set(offset + 0, rankEncodedId); - outArray.set(offset + 1, meta); - }); + long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); + + outArray.set(offset + 0, rankEncodedId); + outArray.set(offset + 1, meta); + } + } outArray.force(); } diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java index 459086b9..3963fd2d 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/ReverseIndexReaderTest.java @@ -2,6 +2,7 @@ package nu.marginalia.index; import nu.marginalia.array.algo.SortingContext; import nu.marginalia.array.buffer.LongQueryBuffer; +import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReversePreindex; import nu.marginalia.index.construction.TestJournalFactory; import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta; @@ -93,7 +94,7 @@ class ReverseIndexReaderTest { private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { var reader = journalFactory.createReader(scenario); - var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir); + var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir); Path docsFile = tempDir.resolve("docs.dat"); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java index d72d8405..7cf22065 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java @@ -54,7 +54,7 @@ class ReversePreindexFinalizeTest { @Test public void testFinalizeSimple() throws IOException { var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51))); - var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir); + var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir); preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); @@ -92,7 +92,7 @@ class ReversePreindexFinalizeTest { new EntryDataWithWordMeta(101, 101, wm(51, 52)) ); - var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir); + var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir, tempDir); preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); preindex.delete(); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java index 89ad25ca..0a772b12 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/construction/ReversePreindexMergeTest.java @@ -54,8 +54,8 @@ class ReversePreindexMergeTest { var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new)); var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new)); - var left = ReversePreindex.constructPreindex(reader1, tempDir, tempDir); - var right = ReversePreindex.constructPreindex(reader2, tempDir, tempDir); + var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir, tempDir); + var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir, tempDir); return ReversePreindex.merge(tempDir, left, right); } diff --git a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java index e955ac5f..187a719a 100644 --- a/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/src/main/java/nu/marginalia/index/IndexConstructorMain.java @@ -6,6 +6,7 @@ import com.google.inject.Inject; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.db.storage.model.FileStorage; import nu.marginalia.db.storage.model.FileStorageType; +import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; @@ -13,6 +14,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.reader.IndexJournalReadEntry; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -105,6 +107,7 @@ public class IndexConstructorMain { ReverseIndexConstructor. createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), + this::addRank, tmpDir, outputFileDocs, outputFileWords); @@ -123,7 +126,7 @@ public class IndexConstructorMain { ReverseIndexConstructor. createReverseIndex(IndexJournalReader::singleFileWithPriorityFilters, - indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords); + indexStaging.asPath(), this::addRank, tmpDir, outputFileDocs, outputFileWords); } private void createForwardIndex() throws SQLException, IOException { @@ -144,7 +147,13 @@ public class IndexConstructorMain { converter.convert(); } + private long addRank(long docId) { + float rank = domainRankings.getSortRanking(docId); + return UrlIdCodec.addRank(rank, docId); + } + private class CreateIndexInstructions { + public final IndexName name; private final MqSingleShotInbox inbox; private final MqMessage message; diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 803682ab..35cd9673 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.results.ResultRankingParameters; import nu.marginalia.index.client.model.results.SearchResultItem; +import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; @@ -148,7 +149,7 @@ public class IndexQueryServiceIntegrationTest { ReverseIndexConstructor. - createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords); + createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords); } private void createPrioReverseIndex() throws SQLException, IOException { @@ -163,7 +164,7 @@ public class IndexQueryServiceIntegrationTest { if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); ReverseIndexConstructor. - createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords); + createReverseIndex(IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords); } private void createForwardIndex() throws SQLException, IOException {