diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java similarity index 94% rename from code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java rename to code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java index da3cb1fe..ce70be2d 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java @@ -21,7 +21,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.Executors; -public class ReverseIndexReader { +public class FullReverseIndexReader { private final LongArray words; private final LongArray documents; private final long wordsDataOffset; @@ -31,10 +31,10 @@ public class ReverseIndexReader { private final PositionsFileReader positionsFileReader; - public ReverseIndexReader(String name, - Path words, - Path documents, - PositionsFileReader positionsFileReader) throws IOException { + public FullReverseIndexReader(String name, + Path words, + Path documents, + PositionsFileReader positionsFileReader) throws IOException { this.name = name; this.positionsFileReader = positionsFileReader; @@ -138,7 +138,7 @@ public class ReverseIndexReader { private BTreeReader createReaderNew(long offset) { return new BTreeReader( documents, - ReverseIndexParameters.docsBTreeContext, + ReverseIndexParameters.fullDocsBTreeContext, offset); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java new file mode 100644 index 00000000..4fd7ed3f --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java @@ -0,0 +1,99 @@ +package nu.marginalia.index; + +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; +import 
nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EmptyEntrySource; +import nu.marginalia.index.query.EntrySource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +public class PrioReverseIndexReader { + private final LongArray words; + private final LongArray documents; + private final long wordsDataOffset; + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final BTreeReader wordsBTreeReader; + private final String name; + + public PrioReverseIndexReader(String name, + Path words, + Path documents) throws IOException { + this.name = name; + + if (!Files.exists(words) || !Files.exists(documents)) { + this.words = null; + this.documents = null; + this.wordsBTreeReader = null; + this.wordsDataOffset = -1; + return; + } + + logger.info("Switching reverse index"); + + this.words = LongArrayFactory.mmapForReadingShared(words); + this.documents = LongArrayFactory.mmapForReadingShared(documents); + + wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0); + wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs(); + + } + + /** Calculate the offset of the word in the documents. + * If the return-value is negative, the term does not exist + * in the index. 
+ */ + long wordOffset(long termId) { + long idx = wordsBTreeReader.findEntry(termId); + + if (idx < 0) + return -1L; + + return words.get(wordsDataOffset + idx + 1); + } + + public EntrySource documents(long termId) { + if (null == words) { + logger.warn("Reverse index is not ready, dropping query"); + return new EmptyEntrySource(); + } + + long offset = wordOffset(termId); + + if (offset < 0) // No documents + return new EmptyEntrySource(); + + return new ReverseIndexEntrySource(name, createReaderNew(offset), 1, termId); + } + + /** Return the number of documents with the termId in the index */ + public int numDocuments(long termId) { + long offset = wordOffset(termId); + + if (offset < 0) + return 0; + + return createReaderNew(offset).numEntries(); + } + + /** Create a BTreeReader for the document offset associated with a termId */ + private BTreeReader createReaderNew(long offset) { + return new BTreeReader( + documents, + ReverseIndexParameters.prioDocsBTreeContext, + offset); + } + + public void close() { + if (documents != null) + documents.close(); + + if (words != null) + words.close(); + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java index a6df15d3..6de56e0c 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java @@ -5,6 +5,7 @@ import nu.marginalia.btree.model.BTreeContext; public class ReverseIndexParameters { - public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); + public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048); + public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, 
BTreeBlockSize.BS_2048); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java index 61dee824..06251aca 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java @@ -22,7 +22,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 2"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); @@ -49,7 +49,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 4"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); for (int j = 0; j < docRange.size(); j+=2) { @@ -84,7 +84,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 6"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var 
docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); Long prev = null; diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java index 668263d8..063324d2 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java @@ -82,7 +82,7 @@ public class FullPreindex { // Estimate the size of the docs index data offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2)); - IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2); + IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2); offsets.fold(0, 0, offsets.size(), sizeEstimator); // Write the docs file @@ -90,7 +90,7 @@ public class FullPreindex { try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { offsets.transformEachIO(0, offsets.size(), new FullIndexBTreeTransformer(finalDocs, 2, - ReverseIndexParameters.docsBTreeContext, + ReverseIndexParameters.fullDocsBTreeContext, intermediateDocChannel)); intermediateDocChannel.force(false); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java index 49442367..bae7990a 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java @@ -20,7 +20,7 @@ import 
java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** A LongArray with document data, segmented according to - * the associated ReversePreindexWordSegments data + * the associated FullPreindexWordSegments data */ public class FullPreindexDocuments { public final LongArray documents; diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java index 4cad80b9..e853fb50 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java @@ -55,8 +55,7 @@ public class PrioIndexConstructor { } try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName); - var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes"); - var posConstructor = new PositionsFileConstructor(outputFilePositions) + var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes") ) { heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); @@ -66,7 +65,7 @@ public class PrioIndexConstructor { .parallelStream() .map(in -> { preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); - return construct(in, posConstructor); + return construct(in); }) .reduce(this::merge) .ifPresent((index) -> { @@ -80,9 +79,9 @@ public class PrioIndexConstructor { } @SneakyThrows - private PrioPreindexReference construct(Path input, PositionsFileConstructor positionsFileConstructor) { + private PrioPreindexReference construct(Path input) { return PrioPreindex - .constructPreindex(readerSource.construct(input), positionsFileConstructor, docIdRewriter, tmpDir) + .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir) .closeToReference(); } diff --git 
a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java index f5449231..64929510 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java @@ -7,7 +7,6 @@ import nu.marginalia.index.ReverseIndexParameters; import nu.marginalia.index.construction.CountToOffsetTransformer; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.IndexSizeEstimator; -import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.journal.reader.IndexJournalReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +43,6 @@ public class PrioPreindex { * will have randomly assigned names. */ public static PrioPreindex constructPreindex(IndexJournalReader reader, - PositionsFileConstructor positionsFileConstructor, DocIdRewriter docIdRewriter, Path workDir) throws IOException { @@ -53,7 +51,7 @@ public class PrioPreindex { Path docsFile = Files.createTempFile(workDir, "docs", ".dat"); var segments = PrioPreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile); - var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, positionsFileConstructor, segments); + var docs = PrioPreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments); return new PrioPreindex(segments, docs); } @@ -81,16 +79,16 @@ public class PrioPreindex { Files.deleteIfExists(outputFileWords); // Estimate the size of the docs index data - offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2)); - IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2); + offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(1)); + IndexSizeEstimator sizeEstimator = 
new IndexSizeEstimator(ReverseIndexParameters.prioDocsBTreeContext, 1); offsets.fold(0, 0, offsets.size(), sizeEstimator); // Write the docs file LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size); try (var intermediateDocChannel = documents.createDocumentsFileChannel()) { offsets.transformEachIO(0, offsets.size(), - new PrioIndexBTreeTransformer(finalDocs, 2, - ReverseIndexParameters.docsBTreeContext, + new PrioIndexBTreeTransformer(finalDocs, 1, + ReverseIndexParameters.prioDocsBTreeContext, intermediateDocChannel)); intermediateDocChannel.force(false); } @@ -137,9 +135,9 @@ public class PrioPreindex { PrioPreindexWordSegments mergingSegment = createMergedSegmentWordFile(destDir, left.segments, right.segments); - var mergingIter = mergingSegment.constructionIterator(2); - var leftIter = left.segments.iterator(2); - var rightIter = right.segments.iterator(2); + var mergingIter = mergingSegment.constructionIterator(1); + var leftIter = left.segments.iterator(1); + var rightIter = right.segments.iterator(1); Path docsFile = Files.createTempFile(destDir, "docs", ".dat"); @@ -200,7 +198,7 @@ public class PrioPreindex { // duplicates in the data, so we need to shrink it to the actual size we wrote. 
mergedDocuments = shrinkMergedDocuments(mergedDocuments, - docsFile, 2 * mergingSegment.totalSize()); + docsFile, mergingSegment.totalSize()); return new PrioPreindex( mergingSegment, @@ -274,8 +272,7 @@ public class PrioPreindex { leftIter.startOffset, leftIter.endOffset, rightIter.startOffset, rightIter.endOffset); - long distinct = segSize / 2; - destIter.putNext(distinct); + destIter.putNext(segSize); leftIter.next(); rightIter.next(); } @@ -297,7 +294,7 @@ public class PrioPreindex { mergingIter.startOffset, end); - boolean putNext = mergingIter.putNext(size / 2); + boolean putNext = mergingIter.putNext(size); boolean iterNext = sourceIter.next(); if (!putNext && iterNext) diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java index 03edb4b4..186d0d65 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java @@ -4,7 +4,6 @@ import lombok.SneakyThrows; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.rwf.RandomFileAssembler; import org.slf4j.Logger; @@ -20,13 +19,12 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; /** A LongArray with document data, segmented according to - * the associated ReversePreindexWordSegments data + * the associated PrioPreindexWordSegments data */ public class PrioPreindexDocuments { public final LongArray documents; - private static PositionsFileConstructor positionsFileConstructor; - private static final int RECORD_SIZE_LONGS = 2; + private static final int 
RECORD_SIZE_LONGS = 1; private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class); public final Path file; @@ -41,9 +39,7 @@ public class PrioPreindexDocuments { Path workDir, IndexJournalReader reader, DocIdRewriter docIdRewriter, - PositionsFileConstructor positionsFileConstructor, PrioPreindexWordSegments segments) throws IOException { - PrioPreindexDocuments.positionsFileConstructor = positionsFileConstructor; createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); @@ -88,11 +84,7 @@ public class PrioPreindexDocuments { long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); - // write position data to the positions file and get the offset - long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positionsBuffer()); - - assembly.put(offset + 0, rankEncodedId); - assembly.put(offset + 1, encodedPosOffset); + assembly.put(offset, rankEncodedId); } } @@ -112,11 +104,10 @@ public class PrioPreindexDocuments { long iterEnd = iter.endOffset; if (iter.size() < 1024) { - docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd); + docsFileMap.sort(iterStart, iterEnd); } else { - sortingWorkers.execute(() -> - docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd)); + sortingWorkers.execute(() -> docsFileMap.sort(iterStart, iterEnd)); } } diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java similarity index 91% rename from code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java rename to code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java index 5047da90..6cf4349c 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java @@ -22,7 +22,7 @@ import java.util.List; import static 
nu.marginalia.index.construction.full.TestJournalFactory.wm; import static org.junit.jupiter.api.Assertions.*; -class ReverseIndexReaderTest { +class FullReverseIndexReaderTest { TestJournalFactory journalFactory; Path tempDir; @@ -82,7 +82,7 @@ class ReverseIndexReaderTest { } - private long[] readEntries(ReverseIndexReader reader, long wordId) { + private long[] readEntries(FullReverseIndexReader reader, long wordId) { var es = reader.documents(wordId); assertTrue(es.hasMore()); LongQueryBuffer buffer = new LongQueryBuffer(4); @@ -91,7 +91,7 @@ class ReverseIndexReaderTest { return buffer.copyData(); } - private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { + private FullReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { var reader = journalFactory.createReader(scenario); Path posFile = tempDir.resolve("positions.dat"); @@ -106,7 +106,7 @@ class ReverseIndexReaderTest { preindex.delete(); } - return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile)); + return new FullReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile)); } } \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java index 6f612a06..359e9396 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java @@ -26,7 +26,7 @@ public class ReverseIndexDebugTest { long wordOffset = wordsBTreeReader.findEntry(problemWord); assertTrue(wordOffset >= 0); - var docsReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordOffset); + var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset); // We find problemDoc even though it doesn't exist in the document range 
long docOffset = docsReader.findEntry(problemDoc); diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java index 48bd8bc0..f34dcd9c 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java @@ -58,7 +58,7 @@ public class TestJournalFactory { return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); } - IndexJournalReader createReader(EntryData... entries) throws IOException { + public IndexJournalReader createReader(EntryData... entries) throws IOException { Path jf = Files.createTempFile(tempDir, "journal", ".dat"); var writer = new IndexJournalWriterSingleFileImpl(jf); diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/FullPreindexTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/FullPreindexTest.java new file mode 100644 index 00000000..24c83553 --- /dev/null +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/FullPreindexTest.java @@ -0,0 +1,86 @@ +package nu.marginalia.index.construction.prio; + +import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.index.PrioReverseIndexReader; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.full.TestJournalFactory; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static nu.marginalia.index.construction.full.TestJournalFactory.*; +import static nu.marginalia.index.construction.full.TestJournalFactory.wm; +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class FullPreindexTest { + Path countsFile; + Path wordsIdFile; + Path docsFile; + Path tempDir; + Path positionsFile; + + TestJournalFactory journalFactory; + + @BeforeEach + public void setUp() throws IOException { + journalFactory = new TestJournalFactory(); + + countsFile = Files.createTempFile("counts", ".dat"); + wordsIdFile = Files.createTempFile("words", ".dat"); + docsFile = Files.createTempFile("docs", ".dat"); + tempDir = Files.createTempDirectory("sort"); + positionsFile = tempDir.resolve("positions.dat"); + } + + @AfterEach + public void tearDown() throws IOException { + journalFactory.clear(); + + Files.deleteIfExists(countsFile); + Files.deleteIfExists(wordsIdFile); + Files.deleteIfExists(positionsFile); + Files.deleteIfExists(docsFile); + + List contents = new ArrayList<>(); + Files.list(tempDir).forEach(contents::add); + for (var tempFile : contents) { + Files.delete(tempFile); + } + Files.delete(tempDir); + } + + @Test + public void testFinalizeSimple() throws IOException { + var journalReader = journalFactory.createReader( + new EntryDataWithWordMeta(100, 101, wm(50, 51)), + new EntryDataWithWordMeta(104, 101, wm(50, 52)) + ); + + var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir); + preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); + preindex.delete(); + + Path wordsFile = tempDir.resolve("words.dat"); + Path docsFile = tempDir.resolve("docs.dat"); + + assertTrue(Files.exists(wordsFile)); + assertTrue(Files.exists(docsFile)); + + var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile); + + var entrySource = indexReader.documents(50); + var lqb = new LongQueryBuffer(32); + entrySource.read(lqb); + + assertEquals(2, lqb.size()); + assertEquals(100, lqb.copyData()[0]); + assertEquals(104, lqb.copyData()[1]); + } +} \ No newline at end of file 
diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index 38fed31e..14e62380 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -38,19 +38,18 @@ public class IndexFactory { return IndexLocations.getSearchSetsPath(fileStorageService); } - public ReverseIndexReader getReverseIndexReader() throws IOException { - return new ReverseIndexReader("full", + public FullReverseIndexReader getReverseIndexReader() throws IOException { + return new FullReverseIndexReader("full", ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT), new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT)) ); } - public ReverseIndexReader getReverseIndexPrioReader() throws IOException { - return new ReverseIndexReader("prio", + public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException { + return new PrioReverseIndexReader("prio", ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), - ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT), - null + ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) ); } diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 5779b526..01a5fd06 100644 --- 
a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -5,7 +5,8 @@ import it.unimi.dsi.fastutil.longs.LongList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.index.ReverseIndexReader; +import nu.marginalia.index.FullReverseIndexReader; +import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.SearchTerms; @@ -38,30 +39,25 @@ public class CombinedIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); private final ForwardIndexReader forwardIndexReader; - private final ReverseIndexReader reverseIndexFullReader; - private final ReverseIndexReader reverseIndexPriorityReader; + private final FullReverseIndexReader reverseIndexFullReader; + private final PrioReverseIndexReader reverseIndexPriorityReader; public CombinedIndexReader(ForwardIndexReader forwardIndexReader, - ReverseIndexReader reverseIndexFullReader, - ReverseIndexReader reverseIndexPriorityReader) { + FullReverseIndexReader reverseIndexFullReader, + PrioReverseIndexReader reverseIndexPriorityReader) { this.forwardIndexReader = forwardIndexReader; this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexPriorityReader = reverseIndexPriorityReader; } public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) { - return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); + return new IndexQueryBuilderImpl(reverseIndexFullReader, query); } public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } - public QueryFilterStepIf hasWordPrio(long termId) { - return reverseIndexPriorityReader.also(termId); - } - - /** Creates a 
query builder for terms in the priority index */ public IndexQueryBuilder findPriorityWord(long wordId) { return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId))) @@ -124,7 +120,7 @@ public class CombinedIndexReader { if (paths.size() < 4) { var prioHead = findPriorityWord(elements.getLong(0)); for (int i = 1; i < elements.size(); i++) { - prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i))); + prioHead.addInclusionFilter(hasWordFull(elements.getLong(i))); } queryHeads.add(prioHead); } diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 0f63fdbc..cd416ca3 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -2,7 +2,8 @@ package nu.marginalia.index.index; import java.util.List; import gnu.trove.set.hash.TLongHashSet; -import nu.marginalia.index.ReverseIndexReader; +import nu.marginalia.index.FullReverseIndexReader; +import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterAnyOf; @@ -10,8 +11,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { private final IndexQuery query; - private final ReverseIndexReader reverseIndexFullReader; - private final ReverseIndexReader reverseIndexPrioReader; + private final FullReverseIndexReader reverseIndexFullReader; /* Keep track of already added include terms to avoid redundant checks. 
* @@ -21,13 +21,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { * */ private final TLongHashSet alreadyConsideredTerms = new TLongHashSet(); - IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader, - ReverseIndexReader reverseIndexPrioReader, - IndexQuery query) + IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query) { this.query = query; this.reverseIndexFullReader = reverseIndexFullReader; - this.reverseIndexPrioReader = reverseIndexPrioReader; } public IndexQueryBuilder withSourceTerms(long... sourceTerms) { diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index 2cd178f2..3cf8a10d 100644 --- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -18,6 +18,7 @@ import nu.marginalia.index.IndexGrpcService; import nu.marginalia.index.ReverseIndexFullFileNames; import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; @@ -269,7 +270,7 @@ public class IntegrationTest { // important to the document. This filter will act on the encoded {@see WordMetadata} LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter(); - var constructor = new FullIndexConstructor( + var constructor = new PrioIndexConstructor( outputFileDocs, outputFileWords, outputFilePositions,