diff --git a/code/common/model/readme.md b/code/common/model/readme.md
index c532e788..e58b9a3e 100644
--- a/code/common/model/readme.md
+++ b/code/common/model/readme.md
@@ -8,4 +8,6 @@ This package contains common models to the search engine
* [EdgeUrl](src/main/java/nu/marginalia/model/EdgeUrl.java)
* [EdgeId](src/main/java/nu/marginalia/model/id/EdgeId.java)
* [DocumentMetadata](src/main/java/nu/marginalia/model/idx/DocumentMetadata.java)
-* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)
\ No newline at end of file
+* [DocumentFlags](src/main/java/nu/marginalia/model/idx/DocumentFlags.java)
+* [WordMetadata](src/main/java/nu/marginalia/model/idx/WordMetadata.java)
+* [WordFlags](src/main/java/nu/marginalia/model/idx/WordFlags.java)
\ No newline at end of file
diff --git a/code/features-convert/keyword-extraction/readme.md b/code/features-convert/keyword-extraction/readme.md
index f67702a1..17ad8600 100644
--- a/code/features-convert/keyword-extraction/readme.md
+++ b/code/features-convert/keyword-extraction/readme.md
@@ -7,6 +7,7 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0
## Central Classes
* [DocumentKeywordExtractor](src/main/java/nu/marginalia/keyword/DocumentKeywordExtractor.java)
+* [KeywordMetadata](src/main/java/nu/marginalia/keyword/KeywordMetadata.java)
## See Also
diff --git a/code/features-index/index-reverse/readme.md b/code/features-index/index-reverse/readme.md
index d3782b07..f42307c0 100644
--- a/code/features-index/index-reverse/readme.md
+++ b/code/features-index/index-reverse/readme.md
@@ -1,9 +1,17 @@
# Reverse Index
-The reverse index contains a mapping from word to document id. It also provides access to
-term-level metadata.
+The reverse index contains a mapping from word to document id.
+
+There are two tiers of this index, one priority index which only indexes terms that are flagged with priority flags1,
+and a full index that indexes all terms. The full index also provides access to term-level metadata, while the priority
+index is a binary index.
+
+[1] See WordFlags in [common/model](../../common/model/) and
+KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).
## Central Classes
-* [ReverseIndexConverter](src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java) constructs the index.
-* [ReverseIndexReader](src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java) interrogates the index.
\ No newline at end of file
+* [ReverseIndexFullConverter](src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java) constructs the full index.
+* [ReverseIndexFullReader](src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java) interrogates the full index.
+* [ReverseIndexPriorityConverter](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java) constructs the priority index.
+* [ReverseIndexPriorityReader](src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java) interrogates the priority index.
\ No newline at end of file
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/CountToOffsetTransformer.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/CountToOffsetTransformer.java
new file mode 100644
index 00000000..29f775f7
--- /dev/null
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/CountToOffsetTransformer.java
@@ -0,0 +1,25 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.functional.LongTransformer;
+
+/**
+ * Transforms an array of item-counts into an array of item-offsets such that the previous counts would fit into an
+ * array indexed by the generated array.
+ *
+ * [ 1, 2, 3, 5, ... ] -> [ 0, 1, 3, 6, 11, ... ]
+ *
+ */
+public class CountToOffsetTransformer implements LongTransformer {
+ long offset = 0;
+
+ public final int entrySize;
+
+ public CountToOffsetTransformer(int entrySize) {
+ this.entrySize = entrySize;
+ }
+
+ @Override
+ public long transform(long pos, long count) {
+ return (offset += entrySize * count);
+ }
+}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/IndexSizeEstimator.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/IndexSizeEstimator.java
new file mode 100644
index 00000000..566b14b5
--- /dev/null
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/IndexSizeEstimator.java
@@ -0,0 +1,29 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.functional.LongBinaryOperation;
+import nu.marginalia.btree.model.BTreeContext;
+
+/** Calculates the necessary size of an index from an array of offsets (@see CountToOffsetTransformer)
+ *
+ * Used with LongArray.fold()
+ * */
+public class IndexSizeEstimator implements LongBinaryOperation {
+ private final BTreeContext bTreeContext;
+ private final int entrySize;
+
+ public long size = 0;
+
+ public IndexSizeEstimator(BTreeContext bTreeContext, int entrySize) {
+ this.bTreeContext = bTreeContext;
+ this.entrySize = entrySize;
+ }
+
+ @Override
+ public long apply(long start, long end) {
+ if (end == start) return end;
+
+ size += bTreeContext.calculateSize((int) (end - start) / entrySize);
+
+ return end;
+ }
+}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java
new file mode 100644
index 00000000..5e94921d
--- /dev/null
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java
@@ -0,0 +1,49 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.functional.LongIOTransformer;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.btree.model.BTreeContext;
+import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+
+/** Constructs the BTrees in a reverse index */
+public class ReverseIndexBTreeTransformer implements LongIOTransformer {
+ private final BTreeWriter writer;
+ private final FileChannel intermediateChannel;
+
+ private final int entrySize;
+
+ long start = 0;
+ long writeOffset = 0;
+
+ public ReverseIndexBTreeTransformer(LongArray urlsFileMap,
+ int entrySize,
+ BTreeContext bTreeContext,
+ FileChannel intermediateChannel) {
+ this.writer = new BTreeWriter(urlsFileMap, bTreeContext);
+ this.entrySize = entrySize;
+ this.intermediateChannel = intermediateChannel;
+ }
+
+ @Override
+ public long transform(long pos, long end) throws IOException {
+
+ final int size = (int) (end - start) / entrySize;
+
+ if (size == 0) {
+ return -1;
+ }
+
+ final long offsetForBlock = writeOffset;
+
+ writeOffset += writer.write(writeOffset, size,
+ mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
+ );
+
+ start = end;
+ return offsetForBlock;
+ }
+}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java
similarity index 71%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java
index 3ac9c449..339e1c39 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java
@@ -1,6 +1,9 @@
-package nu.marginalia.index.reverse;
+package nu.marginalia.index.full;
import lombok.SneakyThrows;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
+import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
@@ -9,10 +12,6 @@ import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.array.IntArray;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
-import nu.marginalia.array.functional.LongBinaryIOOperation;
-import nu.marginalia.array.functional.LongIOTransformer;
-import nu.marginalia.array.functional.LongTransformer;
-import nu.marginalia.btree.BTreeWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,7 +21,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
-public class ReverseIndexConverter {
+import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext;
+
+public class ReverseIndexFullConverter {
private static final int RWF_BIN_SIZE = 10_000_000;
private final Path tmpFileDir;
@@ -35,11 +36,11 @@ public class ReverseIndexConverter {
private final Path outputFileDocs;
private final SortingContext sortingContext;
- public ReverseIndexConverter(Path tmpFileDir,
- IndexJournalReader journalReader,
- DomainRankings domainRankings,
- Path outputFileWords,
- Path outputFileDocs) {
+ public ReverseIndexFullConverter(Path tmpFileDir,
+ IndexJournalReader journalReader,
+ DomainRankings domainRankings,
+ Path outputFileWords,
+ Path outputFileDocs) {
this.tmpFileDir = tmpFileDir;
this.journalReader = journalReader;
this.domainRankings = domainRankings;
@@ -70,7 +71,7 @@ public class ReverseIndexConverter {
logger.info("Gathering Offsets");
journalReader.forEachWordId(wordsOffsets::increment);
- wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer());
+ wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE));
// Construct an intermediate representation of the reverse documents index
try (FileChannel intermediateDocChannel =
@@ -95,7 +96,7 @@ public class ReverseIndexConverter {
{
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
- intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexParameters.ENTRY_SIZE, s, e);
+ intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexFullParameters.ENTRY_SIZE, s, e);
return e;
});
intermediateDocs.force();
@@ -104,14 +105,17 @@ public class ReverseIndexConverter {
logger.info("Sizing");
- SizeEstimator sizeEstimator = new SizeEstimator();
- wordsOffsets.foldIO(0, 0, wordsOffsets.size(), sizeEstimator);
+ IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
+ ReverseIndexFullParameters.bTreeContext,
+ ReverseIndexFullParameters.ENTRY_SIZE);
+
+ wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
logger.info("Finalizing Docs File");
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
// Construct the proper reverse index
- wordsOffsets.transformEachIO(0, wordsOffsets.size(), new CreateReverseIndexBTreeTransformer(finalDocs, intermediateDocChannel));
+ wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
wordsOffsets.write(outputFileWords);
// Attempt to clean up before forcing (important disk space preservation)
@@ -130,66 +134,11 @@ public class ReverseIndexConverter {
}
}
- private static class SizeEstimator implements LongBinaryIOOperation {
- public long size = 0;
- @Override
- public long apply(long start, long end) {
- if (end == start) return end;
-
- size += ReverseIndexParameters.bTreeContext.calculateSize((int) (end - start) / ReverseIndexParameters.ENTRY_SIZE);
-
- return end;
- }
- }
-
private void deleteOldFiles() throws IOException {
Files.deleteIfExists(outputFileWords);
Files.deleteIfExists(outputFileDocs);
}
- private static class CountToOffsetTransformer implements LongTransformer {
- long offset = 0;
-
- @Override
- public long transform(long pos, long count) {
- return (offset += ReverseIndexParameters.ENTRY_SIZE * count);
- }
- }
-
- private static class CreateReverseIndexBTreeTransformer implements LongIOTransformer {
- private final BTreeWriter writer;
- private final FileChannel intermediateChannel;
-
- long start = 0;
- long writeOffset = 0;
-
- public CreateReverseIndexBTreeTransformer(LongArray urlsFileMap, FileChannel intermediateChannel) {
- this.writer = new BTreeWriter(urlsFileMap, ReverseIndexParameters.bTreeContext);
- this.intermediateChannel = intermediateChannel;
- }
-
- @Override
- public long transform(long pos, long end) throws IOException {
-
- assert (end - start) % ReverseIndexParameters.ENTRY_SIZE == 0;
-
- final int size = (int)(end - start) / ReverseIndexParameters.ENTRY_SIZE;
-
- if (size == 0) {
- return -1;
- }
-
- final long offsetForBlock = writeOffset;
-
- writeOffset += writer.write(writeOffset, size,
- mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start)
- );
-
- start = end;
- return offsetForBlock;
- }
- }
-
private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer, AutoCloseable {
private final LongArray wordRangeEnds;
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java
similarity index 64%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java
index 13cdc3af..7ea6bfe8 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySource.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullEntrySource.java
@@ -1,27 +1,30 @@
-package nu.marginalia.index.reverse.query;
+package nu.marginalia.index.full;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import static java.lang.Math.min;
-public class ReverseIndexEntrySource implements EntrySource {
+public class ReverseIndexFullEntrySource implements EntrySource {
private final BTreeReader reader;
- private static final int ENTRY_SIZE = 2;
-
int pos;
int endOffset;
+ final int entrySize;
private final ReverseIndexEntrySourceBehavior behavior;
- public ReverseIndexEntrySource(BTreeReader reader, ReverseIndexEntrySourceBehavior behavior) {
+ public ReverseIndexFullEntrySource(BTreeReader reader,
+ int entrySize,
+ ReverseIndexEntrySourceBehavior behavior) {
this.reader = reader;
this.behavior = behavior;
+ this.entrySize = entrySize;
pos = 0;
- endOffset = pos + ENTRY_SIZE*reader.numEntries();
+ endOffset = pos + entrySize * reader.numEntries();
}
@Override
@@ -39,9 +42,7 @@ public class ReverseIndexEntrySource implements EntrySource {
}
buffer.end = min(buffer.end, endOffset - pos);
-
reader.readData(buffer.data, buffer.end, pos);
-
pos += buffer.end;
destagger(buffer);
@@ -49,14 +50,14 @@ public class ReverseIndexEntrySource implements EntrySource {
}
private void destagger(LongQueryBuffer buffer) {
- if (ENTRY_SIZE == 1)
+ if (entrySize == 1)
return;
- for (int ri = ENTRY_SIZE, wi=1; ri < buffer.end ; ri+=ENTRY_SIZE, wi++) {
+ for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) {
buffer.data[wi] = buffer.data[ri];
}
- buffer.end /= ENTRY_SIZE;
+ buffer.end /= entrySize;
}
@Override
@@ -64,9 +65,4 @@ public class ReverseIndexEntrySource implements EntrySource {
return pos < endOffset;
}
- @Override
- public String toString() {
- return String.format("BTreeRange.EntrySource(@" + pos + ": " + endOffset + ")");
- }
-
}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java
similarity index 51%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java
index c1caaa0c..fb767cb2 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexParameters.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullParameters.java
@@ -1,16 +1,16 @@
-package nu.marginalia.index.reverse;
+package nu.marginalia.index.full;
import nu.marginalia.btree.model.BTreeBlockSize;
import nu.marginalia.btree.model.BTreeContext;
-class ReverseIndexParameters {
- public static final int ENTRY_SIZE = 2;
+public class ReverseIndexFullParameters {
+ static final int ENTRY_SIZE = 2;
// This is the byte size per index page on disk, the data pages are twice as large due to ENTRY_SIZE = 2.
//
// Given a hardware limit of 4k reads, 2k block size should be optimal.
- public static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048;
+ static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048;
- public static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
+ static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java
similarity index 83%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java
index 7c078e9c..32f63c7d 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexReader.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java
@@ -1,9 +1,8 @@
-package nu.marginalia.index.reverse;
+package nu.marginalia.index.full;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySource;
-import nu.marginalia.index.reverse.query.ReverseIndexRejectFilter;
-import nu.marginalia.index.reverse.query.ReverseIndexRetainFilter;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
+import nu.marginalia.index.query.ReverseIndexRejectFilter;
+import nu.marginalia.index.query.ReverseIndexRetainFilter;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.query.EmptyEntrySource;
@@ -19,13 +18,13 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
-public class ReverseIndexReader {
+public class ReverseIndexFullReader {
private final LongArray words;
private final LongArray documents;
private final Logger logger = LoggerFactory.getLogger(getClass());
- public ReverseIndexReader(Path words, Path documents) throws IOException {
+ public ReverseIndexFullReader(Path words, Path documents) throws IOException {
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
@@ -64,7 +63,7 @@ public class ReverseIndexReader {
if (offset < 0) return new EmptyEntrySource();
- return new ReverseIndexEntrySource(createReaderNew(offset), behavior);
+ return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, behavior);
}
public QueryFilterStepIf also(int wordId) {
@@ -100,7 +99,7 @@ public class ReverseIndexReader {
}
private BTreeReader createReaderNew(long offset) {
- return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset);
+ return new BTreeReader(documents, ReverseIndexFullParameters.bTreeContext, offset);
}
public long[] getTermMeta(int wordId, long[] docIds) {
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java
new file mode 100644
index 00000000..fbd49405
--- /dev/null
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java
@@ -0,0 +1,197 @@
+package nu.marginalia.index.priority;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.IntArray;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.SortingContext;
+import nu.marginalia.index.construction.CountToOffsetTransformer;
+import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
+import nu.marginalia.index.construction.IndexSizeEstimator;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalStatistics;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.ranking.DomainRankings;
+import nu.marginalia.rwf.RandomWriteFunnel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public class ReverseIndexPriorityConverter {
+ private static final int RWF_BIN_SIZE = 10_000_000;
+
+ private final Path tmpFileDir;
+
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ private final IndexJournalReader journalReader;
+ private final DomainRankings domainRankings;
+ private final Path outputFileWords;
+ private final Path outputFileDocs;
+ private final SortingContext sortingContext;
+
+ public ReverseIndexPriorityConverter(Path tmpFileDir,
+ IndexJournalReader journalReader,
+ DomainRankings domainRankings,
+ Path outputFileWords,
+ Path outputFileDocs) {
+ this.tmpFileDir = tmpFileDir;
+ this.journalReader = journalReader;
+ this.domainRankings = domainRankings;
+ this.outputFileWords = outputFileWords;
+ this.outputFileDocs = outputFileDocs;
+ this.sortingContext = new SortingContext(tmpFileDir, 64_000);
+ }
+
+ public void convert() throws IOException {
+ deleteOldFiles();
+
+ if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
+ logger.warn("Bailing: Journal is empty!");
+ return;
+ }
+
+ final IndexJournalStatistics statistics = journalReader.getStatistics();
+
+ final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
+
+
+ try {
+ final long wordsFileSize = statistics.highestWord() + 1;
+
+ logger.debug("Words file size: {}", wordsFileSize);
+ // Create a count of how many documents has contains each word
+ final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);
+
+ logger.info("Gathering Offsets");
+ journalReader.forEachWordId(wordsOffsets::increment);
+ wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE));
+
+ // Construct an intermediate representation of the reverse documents index
+ try (FileChannel intermediateDocChannel =
+ (FileChannel) Files.newByteChannel(intermediateUrlsFile,
+ StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
+ {
+ logger.info("Creating Intermediate Docs File");
+
+ // Construct intermediate index
+ try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
+ IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
+ )
+ {
+ journalReader.forEachDocIdRecord(intermediateIndexConstructor);
+ intermediateDocumentWriteFunnel.write(intermediateDocChannel);
+ }
+ intermediateDocChannel.force(false);
+
+ logger.info("Sorting Intermediate Docs File");
+
+ // Sort each segment of the intermediate file
+ {
+ LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
+ wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
+ intermediateDocs.sortLargeSpan(sortingContext, s, e);
+ return e;
+ });
+ intermediateDocs.force();
+ }
+
+
+ logger.info("Sizing");
+
+ IndexSizeEstimator indexSizeEstimator = new IndexSizeEstimator(
+ ReverseIndexPriorityParameters.bTreeContext,
+ ReverseIndexPriorityParameters.ENTRY_SIZE);
+
+ wordsOffsets.fold(0, 0, wordsOffsets.size(), indexSizeEstimator);
+
+ logger.info("Finalizing Docs File");
+
+ LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, indexSizeEstimator.size);
+ // Construct the proper reverse index
+ wordsOffsets.transformEachIO(0, wordsOffsets.size(),
+ new ReverseIndexBTreeTransformer(finalDocs,
+ ReverseIndexPriorityParameters.ENTRY_SIZE,
+ ReverseIndexPriorityParameters.bTreeContext,
+ intermediateDocChannel));
+ wordsOffsets.write(outputFileWords);
+
+ // Attempt to clean up before forcing (important disk space preservation)
+ Files.deleteIfExists(intermediateUrlsFile);
+
+ wordsOffsets.force();
+ finalDocs.force();
+ logger.info("Done");
+ }
+
+ } catch (IOException ex) {
+ logger.error("Failed to convert", ex);
+ throw ex;
+ } finally {
+ Files.deleteIfExists(intermediateUrlsFile);
+ }
+ }
+
+ private void deleteOldFiles() throws IOException {
+ Files.deleteIfExists(outputFileWords);
+ Files.deleteIfExists(outputFileDocs);
+ }
+
+ private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer, AutoCloseable {
+
+ private final LongArray wordRangeEnds;
+ private final IntArray wordRangeOffset;
+ private final RandomWriteFunnel documentsFile;
+
+ private final Path tempFile;
+
+ public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
+ tempFile = Files.createTempFile(tempDir, "iic", "dat");
+
+ this.wordRangeEnds = wordRangeEnds;
+ this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
+ this.documentsFile = documentsFile;
+ }
+
+ @SneakyThrows
+ @Override
+ public void accept(long docId, IndexJournalEntryData.Record record) {
+
+ /* Encode the ID as
+ *
+ * 32 bits 32 bits
+ * [ ranking | url-id ]
+ *
+ * in order to get low-ranking documents to be considered first
+ * when sorting the items.
+ */
+
+ int domainId = (int) (docId >>> 32);
+ long rankingId = (long) domainRankings.getRanking(domainId) << 32;
+
+ int urlId = (int) (docId & 0xFFFF_FFFFL);
+ long rankEncodedId = rankingId | urlId;
+
+ final int wordId = record.wordId();
+ long offset = startOfRange(wordId);
+
+ documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
+ }
+
+ private long startOfRange(int wordId) {
+ if (wordId == 0) return 0;
+
+ return wordRangeEnds.get(wordId - 1);
+ }
+
+ public void close() throws IOException {
+ Files.delete(tempFile);
+ }
+ }
+
+}
+
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java
new file mode 100644
index 00000000..7c530d66
--- /dev/null
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityEntrySource.java
@@ -0,0 +1,52 @@
+package nu.marginalia.index.priority;
+
+import nu.marginalia.array.buffer.LongQueryBuffer;
+import nu.marginalia.btree.BTreeReader;
+import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
+
+import static java.lang.Math.min;
+
+public class ReverseIndexPriorityEntrySource implements EntrySource {
+ private final BTreeReader reader;
+
+ int pos;
+ int endOffset;
+
+ private final ReverseIndexEntrySourceBehavior behavior;
+
+ public ReverseIndexPriorityEntrySource(BTreeReader reader, ReverseIndexEntrySourceBehavior behavior) {
+ this.reader = reader;
+ this.behavior = behavior;
+
+ pos = 0;
+ endOffset = pos + reader.numEntries();
+ }
+
+ @Override
+ public void skip(int n) {
+ pos += n;
+ }
+
+ @Override
+ public void read(LongQueryBuffer buffer) {
+ if (behavior == ReverseIndexEntrySourceBehavior.DO_NOT_PREFER
+ && buffer.hasRetainedData())
+ {
+ pos = endOffset;
+ return;
+ }
+
+ buffer.end = min(buffer.end, endOffset - pos);
+ reader.readData(buffer.data, buffer.end, pos);
+ pos += buffer.end;
+
+ buffer.uniq();
+ }
+
+ @Override
+ public boolean hasMore() {
+ return pos < endOffset;
+ }
+
+}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java
similarity index 68%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java
index dba81461..5cd09307 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPriorityParameters.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityParameters.java
@@ -1,9 +1,16 @@
-package nu.marginalia.index.reverse;
+package nu.marginalia.index.priority;
+import nu.marginalia.btree.model.BTreeBlockSize;
+import nu.marginalia.btree.model.BTreeContext;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.model.idx.WordFlags;
public class ReverseIndexPriorityParameters {
+ static final int ENTRY_SIZE = 1;
+ static final BTreeBlockSize blockSize = BTreeBlockSize.BS_4096;
+
+ static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
+
private static final long highPriorityFlags =
WordFlags.Title.asBit()
| WordFlags.Subjects.asBit()
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
similarity index 70%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
index 2a38a9dd..b4b5d75a 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexPrioReader.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityReader.java
@@ -1,7 +1,6 @@
-package nu.marginalia.index.reverse;
+package nu.marginalia.index.priority;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySource;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.array.LongArray;
import nu.marginalia.btree.BTreeReader;
@@ -13,13 +12,13 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-public class ReverseIndexPrioReader {
+public class ReverseIndexPriorityReader {
private final LongArray words;
private final LongArray documents;
private final Logger logger = LoggerFactory.getLogger(getClass());
- public ReverseIndexPrioReader(Path words, Path documents) throws IOException {
+ public ReverseIndexPriorityReader(Path words, Path documents) throws IOException {
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
@@ -44,10 +43,10 @@ public class ReverseIndexPrioReader {
if (offset < 0) return new EmptyEntrySource();
- return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER);
+ return new ReverseIndexPriorityEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER);
}
private BTreeReader createReaderNew(long offset) {
- return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset);
+ return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
}
}
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexEntrySourceBehavior.java
similarity index 84%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexEntrySourceBehavior.java
index 67058fed..4b84bda7 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexEntrySourceBehavior.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexEntrySourceBehavior.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.reverse.query;
+package nu.marginalia.index.query;
public enum ReverseIndexEntrySourceBehavior {
/** Eagerly read from this entry source */
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRejectFilter.java
similarity index 93%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRejectFilter.java
index 0ad4112f..c10a7845 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRejectFilter.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRejectFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.reverse.query;
+package nu.marginalia.index.query;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java
similarity index 93%
rename from code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java
rename to code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java
index a9a14dad..33f23c71 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/reverse/query/ReverseIndexRetainFilter.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/query/ReverseIndexRetainFilter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.reverse.query;
+package nu.marginalia.index.query;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.btree.BTreeReader;
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java
similarity index 92%
rename from code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java
rename to code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java
index 67977ffa..c8624196 100644
--- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest.java
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java
@@ -2,11 +2,13 @@ package nu.marginalia.index.reverse;
import lombok.SneakyThrows;
import nu.marginalia.array.buffer.LongQueryBuffer;
+import nu.marginalia.index.full.ReverseIndexFullConverter;
+import nu.marginalia.index.full.ReverseIndexFullReader;
import nu.marginalia.index.journal.model.IndexJournalEntry;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@@ -25,7 +27,7 @@ import java.util.stream.LongStream;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
-class ReverseIndexConverterTest {
+class ReverseIndexFullConverterTest {
KeywordLexicon keywordLexicon;
Path indexFile;
@@ -82,10 +84,10 @@ class ReverseIndexConverterTest {
var docsFile = dataDir.resolve("docs.dat");
var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile);
- new ReverseIndexConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile)
+ new ReverseIndexFullConverter(tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile)
.convert();
- var reverseIndexReader = new ReverseIndexReader(wordsFile, docsFile);
+ var reverseIndexReader = new ReverseIndexFullReader(wordsFile, docsFile);
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("1")));
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("2")));
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java
similarity index 85%
rename from code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java
rename to code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java
index d07d333c..fd6bb1d8 100644
--- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexConverterTest2.java
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java
@@ -2,12 +2,15 @@ package nu.marginalia.index.reverse;
import lombok.SneakyThrows;
import nu.marginalia.array.buffer.LongQueryBuffer;
+import nu.marginalia.index.full.ReverseIndexFullConverter;
+import nu.marginalia.index.full.ReverseIndexFullReader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
+import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@@ -25,7 +28,7 @@ import java.util.Arrays;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
-class ReverseIndexConverterTest2 {
+class ReverseIndexFullConverterTest2 {
KeywordLexicon keywordLexicon;
IndexJournalWriter writer;
@@ -114,9 +117,9 @@ class ReverseIndexConverterTest2 {
Path tmpDir = Path.of("/tmp");
- new ReverseIndexConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert();
+ new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert();
- var reverseReader = new ReverseIndexReader(wordsFile, docsFile);
+ var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile);
for (int i = workSetStart; i < workSetSize; i++) {
@@ -139,9 +142,9 @@ class ReverseIndexConverterTest2 {
Path tmpDir = Path.of("/tmp");
- new ReverseIndexConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert();
+ new ReverseIndexFullConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert();
- var reverseReader = new ReverseIndexReader(wordsFile, docsFile);
+ var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile);
for (int i = workSetStart; i < workSetSize; i++) {
diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java
new file mode 100644
index 00000000..21d6198b
--- /dev/null
+++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java
@@ -0,0 +1,163 @@
+package nu.marginalia.index.reverse;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.buffer.LongQueryBuffer;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
+import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
+import nu.marginalia.index.priority.ReverseIndexPriorityReader;
+import nu.marginalia.index.priority.ReverseIndexPriorityConverter;
+import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
+import nu.marginalia.lexicon.KeywordLexicon;
+import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.ranking.DomainRankings;
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.stream.IntStream;
+import java.util.stream.LongStream;
+
+class ReverseIndexPriorityConverterTest2 {
+
+ KeywordLexicon keywordLexicon;
+ IndexJournalWriter writer;
+
+ Path indexFile;
+ Path wordsFile1;
+ Path urlsFile1;
+ Path dictionaryFile;
+
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ Path dataDir;
+ private Path wordsFile;
+ private Path docsFile;
+
+ int workSetSize = 8192;
+ int workSetStart = 8000;
+
+ @BeforeEach
+ @SneakyThrows
+ void setUp() {
+ dictionaryFile = Files.createTempFile("tmp", ".dict");
+ dictionaryFile.toFile().deleteOnExit();
+
+ keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile()));
+ keywordLexicon.getOrInsert("0");
+
+ indexFile = Files.createTempFile("tmp", ".idx");
+ indexFile.toFile().deleteOnExit();
+ writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
+
+ wordsFile1 = Files.createTempFile("words1", ".idx");
+ urlsFile1 = Files.createTempFile("urls1", ".idx");
+
+ dataDir = Files.createTempDirectory(getClass().getSimpleName());
+
+ for (int i = 1; i < workSetSize; i++) {
+ if (i < workSetStart) {
+ keywordLexicon.getOrInsert(Integer.toString(i));
+ }
+ else {
+ createEntry(writer, keywordLexicon, i);
+ }
+ }
+
+ keywordLexicon.commitToDisk();
+ Thread.sleep(1000);
+ writer.forceWrite();
+
+ var reader = new IndexJournalReaderSingleCompressedFile(indexFile);
+
+ wordsFile = dataDir.resolve("words.dat");
+ docsFile = dataDir.resolve("docs.dat");
+ }
+
+ @AfterEach
+ public void tearDown() {
+ TestUtil.clearTempDir(dataDir);
+ }
+
+ public int[] getFactorsI(int id) {
+ return IntStream.rangeClosed(1, id-1).toArray();
+ }
+ public long[] getFactorsL(int id) {
+ return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
+ }
+
+ long createId(long url, long domain) {
+ return (domain << 32) | url;
+ }
+ public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
+ int[] factors = getFactorsI(id);
+ var header = new IndexJournalEntryHeader(factors.length, createId(id, id/20), id % 5);
+
+ long[] data = new long[factors.length*2];
+ for (int i = 0; i < factors.length; i++) {
+ data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
+ data[2*i + 1] = (i % 21 != 0) ? 0 : -factors[i];
+ }
+
+ writer.put(header, new IndexJournalEntryData(data));
+ }
+
+ @Test
+ void testRev2() throws IOException {
+
+ Path tmpDir = Path.of("/tmp");
+
+ new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert();
+
+ var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile);
+
+ for (int i = workSetStart; i < workSetSize; i++) {
+
+ var es = reverseReader.priorityDocuments(i);
+ LongQueryBuffer lqb = new LongQueryBuffer(100);
+ while (es.hasMore()) {
+ lqb.reset();
+ es.read(lqb);
+ System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
+ }
+ System.out.println("--");
+ }
+
+ TestUtil.clearTempDir(dataDir);
+ }
+
+
+ @Test
+ void testRevP() throws IOException {
+
+ Path tmpDir = Path.of("/tmp");
+
+ new ReverseIndexPriorityConverter(tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert();
+
+ var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile);
+
+ for (int i = workSetStart; i < workSetSize; i++) {
+
+ var es = reverseReader.priorityDocuments(i);
+ LongQueryBuffer lqb = new LongQueryBuffer(100);
+ while (es.hasMore()) {
+ lqb.reset();
+ es.read(lqb);
+ System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
+ }
+ System.out.println("--");
+ }
+
+ TestUtil.clearTempDir(dataDir);
+ }
+
+}
\ No newline at end of file
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java
index d21334b1..ec43819a 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/IndexServicesFactory.java
@@ -8,10 +8,11 @@ import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
-import nu.marginalia.index.reverse.ReverseIndexConverter;
-import nu.marginalia.index.reverse.ReverseIndexPrioReader;
-import nu.marginalia.index.reverse.ReverseIndexPriorityParameters;
-import nu.marginalia.index.reverse.ReverseIndexReader;
+import nu.marginalia.index.priority.ReverseIndexPriorityConverter;
+import nu.marginalia.index.full.ReverseIndexFullConverter;
+import nu.marginalia.index.priority.ReverseIndexPriorityReader;
+import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
+import nu.marginalia.index.full.ReverseIndexFullReader;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.index.index.SearchIndexReader;
@@ -110,7 +111,7 @@ public class IndexServicesFactory {
logger.info("Converting full reverse index {}", source);
var journalReader = new IndexJournalReaderSingleCompressedFile(source);
- var converter = new ReverseIndexConverter(tmpFileDir,
+ var converter = new ReverseIndexFullConverter(tmpFileDir,
journalReader,
domainRankings,
revIndexWords.get(NEXT_PART).toPath(),
@@ -127,9 +128,10 @@ public class IndexServicesFactory {
logger.info("Converting priority reverse index {}", source);
- var journalReader = new IndexJournalReaderSingleCompressedFile(source, null, ReverseIndexPriorityParameters::filterPriorityRecord);
+ var journalReader = new IndexJournalReaderSingleCompressedFile(source, null,
+ ReverseIndexPriorityParameters::filterPriorityRecord);
- var converter = new ReverseIndexConverter(tmpFileDir,
+ var converter = new ReverseIndexPriorityConverter(tmpFileDir,
journalReader,
domainRankings,
revPrioIndexWords.get(NEXT_PART).toPath(),
@@ -164,17 +166,16 @@ public class IndexServicesFactory {
throw new RuntimeException(e);
}
- System.runFinalization();
System.gc();
}
- public ReverseIndexReader getReverseIndexReader() throws IOException {
- return new ReverseIndexReader(
+ public ReverseIndexFullReader getReverseIndexReader() throws IOException {
+ return new ReverseIndexFullReader(
revIndexWords.get(LIVE_PART).toPath(),
revIndexDoc.get(LIVE_PART).toPath());
}
- public ReverseIndexPrioReader getReverseIndexPrioReader() throws IOException {
- return new ReverseIndexPrioReader(
+ public ReverseIndexPriorityReader getReverseIndexPrioReader() throws IOException {
+ return new ReverseIndexPriorityReader(
revPrioIndexWords.get(LIVE_PART).toPath(),
revPrioIndexDoc.get(LIVE_PART).toPath());
}
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
index 89160aae..80196a36 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexQueryBuilder.java
@@ -3,27 +3,27 @@ package nu.marginalia.index.index;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.reverse.ReverseIndexReader;
+import nu.marginalia.index.full.ReverseIndexFullReader;
public class SearchIndexQueryBuilder implements IndexQueryBuilder {
private final IndexQuery query;
- private final ReverseIndexReader reverseIndexReader;
+ private final ReverseIndexFullReader reverseIndexFullReader;
- SearchIndexQueryBuilder(ReverseIndexReader reverseIndexReader, IndexQuery query) {
+ SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader, IndexQuery query) {
this.query = query;
- this.reverseIndexReader = reverseIndexReader;
+ this.reverseIndexFullReader = reverseIndexFullReader;
}
public IndexQueryBuilder also(int termId) {
- query.addInclusionFilter(reverseIndexReader.also(termId));
+ query.addInclusionFilter(reverseIndexFullReader.also(termId));
return this;
}
public IndexQueryBuilder not(int termId) {
- query.addInclusionFilter(reverseIndexReader.not(termId));
+ query.addInclusionFilter(reverseIndexFullReader.not(termId));
return this;
}
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
index 792257af..fe134f6e 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java
@@ -1,6 +1,5 @@
package nu.marginalia.index.index;
-import lombok.SneakyThrows;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.ParamMatchingQueryFilter;
import nu.marginalia.index.query.EntrySource;
@@ -8,9 +7,9 @@ import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.reverse.ReverseIndexPrioReader;
-import nu.marginalia.index.reverse.ReverseIndexReader;
-import nu.marginalia.index.reverse.query.ReverseIndexEntrySourceBehavior;
+import nu.marginalia.index.priority.ReverseIndexPriorityReader;
+import nu.marginalia.index.full.ReverseIndexFullReader;
+import nu.marginalia.index.query.ReverseIndexEntrySourceBehavior;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -21,33 +20,33 @@ public class SearchIndexReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ForwardIndexReader forwardIndexReader;
- private final ReverseIndexReader reverseIndexReader;
- private final ReverseIndexPrioReader reverseIndexPrioReader;
+ private final ReverseIndexFullReader reverseIndexFullReader;
+ private final ReverseIndexPriorityReader reverseIndexPriorityReader;
public SearchIndexReader(ForwardIndexReader forwardIndexReader,
- ReverseIndexReader reverseIndexReader,
- ReverseIndexPrioReader reverseIndexPrioReader) {
+ ReverseIndexFullReader reverseIndexFullReader,
+ ReverseIndexPriorityReader reverseIndexPriorityReader) {
this.forwardIndexReader = forwardIndexReader;
- this.reverseIndexReader = reverseIndexReader;
- this.reverseIndexPrioReader = reverseIndexPrioReader;
+ this.reverseIndexFullReader = reverseIndexFullReader;
+ this.reverseIndexPriorityReader = reverseIndexPriorityReader;
}
public IndexQueryBuilder findWordAsSentence(int[] wordIdsByFrequency) {
List entrySources = new ArrayList<>(1);
- entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
+ entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_PREFER));
- return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources));
+ return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
}
public IndexQueryBuilder findWordAsTopic(int[] wordIdsByFrequency) {
List entrySources = new ArrayList<>(wordIdsByFrequency.length);
for (int wordId : wordIdsByFrequency) {
- entrySources.add(reverseIndexPrioReader.priorityDocuments(wordId));
+ entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
- return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources));
+ return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
}
public IndexQueryBuilder findWordTopicDynamicMode(int[] wordIdsByFrequency) {
@@ -58,12 +57,12 @@ public class SearchIndexReader {
List entrySources = new ArrayList<>(wordIdsByFrequency.length + 1);
for (int wordId : wordIdsByFrequency) {
- entrySources.add(reverseIndexPrioReader.priorityDocuments(wordId));
+ entrySources.add(reverseIndexPriorityReader.priorityDocuments(wordId));
}
- entrySources.add(reverseIndexReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
+ entrySources.add(reverseIndexFullReader.documents(wordIdsByFrequency[0], ReverseIndexEntrySourceBehavior.DO_NOT_PREFER));
- return new SearchIndexQueryBuilder(reverseIndexReader, new IndexQuery(entrySources));
+ return new SearchIndexQueryBuilder(reverseIndexFullReader, new IndexQuery(entrySources));
}
QueryFilterStepIf filterForParams(IndexQueryParams params) {
@@ -71,11 +70,11 @@ public class SearchIndexReader {
}
public long numHits(int word) {
- return reverseIndexReader.numDocuments(word);
+ return reverseIndexFullReader.numDocuments(word);
}
public long[] getMetadata(int wordId, long[] docIds) {
- return reverseIndexReader.getTermMeta(wordId, docIds);
+ return reverseIndexFullReader.getTermMeta(wordId, docIds);
}
public long getDocumentMetadata(long docId) {