From 36160988e29ec0fb05ef6dfe4c34243718719f60 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 10 Jun 2024 15:09:06 +0200 Subject: [PATCH] (index) Integrate positions data with indexes WIP This change integrates the new positions data with the forward and reverse indexes. The ranking code is still only partially re-written. --- .../model/compiled/CompiledQueryInt.java | 6 +- .../model/compiled/CompiledQueryParser.java | 3 +- .../model/compiled/CqExpression.java | 12 + .../aggregate/CompiledQueryAggregates.java | 6 +- .../aggregate/CqIntMaxMinOperator.java | 5 +- .../searchquery/model/query/SearchQuery.java | 44 ++ .../model/results/SearchResultItem.java | 4 +- .../index/forward/ForwardIndexConverter.java | 4 +- .../index/forward/ForwardIndexReader.java | 12 +- .../forward/ForwardIndexConverterTest.java | 1 + .../model/IndexJournalEntryHeader.java | 3 + .../journal/reader/IndexJournalReadEntry.java | 15 +- .../journal/reader/IndexJournalReader.java | 2 +- .../reader/IndexJournalReaderSingleFile.java | 3 + .../reader/pointer/IndexJournalPointer.java | 13 + .../journal/writer/IndexJournalWriter.java | 1 - .../IndexJournalWriterSingleFileImpl.java | 30 +- .../index/journal/IndexJournalWriterTest.java | 100 ++++- .../marginalia/index/ReverseIndexReader.java | 48 ++- .../PositionsFileConstructor.java | 13 +- .../construction/ReverseIndexConstructor.java | 1 - .../ReversePreindexDocuments.java | 17 +- .../index/positions/PositionCodec.java | 25 ++ .../index/positions/PositionsFileReader.java | 39 ++ .../marginalia/index/positions/TermData.java | 21 + .../index/PositionsFileReaderTest.java | 63 +++ .../index/ReverseIndexReaderTest.java | 36 +- .../construction/ReversePreindexDocsTest.java | 10 +- .../ReversePreindexFinalizeTest.java | 6 - .../construction/TestJournalFactory.java | 14 +- .../nu/marginalia/index/IndexFactory.java | 10 +- .../nu/marginalia/index/IndexGrpcService.java | 5 +- .../index/index/CombinedIndexReader.java | 14 +- 
.../index/results/IndexMetadataService.java | 18 +- .../results/IndexResultValuationContext.java | 140 ++++--- .../results/IndexResultValuatorService.java | 104 +++-- .../TermMetadataForCombinedDocumentIds.java | 47 ++- .../results/model/ids/CombinedDocIdList.java | 4 + .../results/model/ids/DocMetadataList.java | 45 --- .../index/results/model/ids/TermIdList.java | 10 + .../results/model/ids/TermMetadataList.java | 55 +++ .../ranking/results/ResultValuator.java | 28 +- .../results/factors/Bm25FullGraphVisitor.java | 27 +- .../results/factors/TermCoherenceFactor.java | 55 +-- .../index/CombinedIndexReaderTest.java | 382 ++++++++++++++++++ ...IndexQueryServiceIntegrationSmokeTest.java | 113 ++++-- .../IndexQueryServiceIntegrationTest.java | 1 + .../ranking/results/ResultValuatorTest.java | 100 ----- .../factors/TermCoherenceFactorTest.java | 107 ----- .../marginalia/sequence/EliasGammaCodec.java | 22 +- .../sequence/GammaCodedSequence.java | 37 +- .../sequence/SequenceOperations.java | 86 ++++ .../nu/marginalia/sequence/io/BitReader.java | 4 + .../sequence/SequenceOperationsTest.java | 75 ++++ .../DocumentRecordKeywordsProjection.java | 5 +- .../loading/LoaderIndexJournalWriter.java | 13 +- .../documents/KeywordLoaderService.java | 1 + .../paperdoll/SearchServicePaperDoll.java | 2 +- 58 files changed, 1417 insertions(+), 650 deletions(-) create mode 100644 code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java create mode 100644 code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java create mode 100644 code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java create mode 100644 code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java delete mode 100644 code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java create mode 100644 code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java create mode 100644 
code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java delete mode 100644 code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java delete mode 100644 code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java create mode 100644 code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java create mode 100644 code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java index 9e26c35c..0f80d479 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -5,8 +5,8 @@ import java.util.stream.IntStream; /** A compiled index service query */ public class CompiledQueryInt { - private final CqExpression root; - private final CqDataInt data; + public final CqExpression root; + public final CqDataInt data; public CompiledQueryInt(CqExpression root, CqDataInt data) { this.root = root; @@ -26,7 +26,7 @@ public class CompiledQueryInt { return IntStream.range(0, data.size()); } - public long at(int index) { + public int at(int index) { return data.get(index); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java index ae197fb9..ef379e5a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java @@ -61,7 +61,8 @@ public class CompiledQueryParser { String[] 
cqData = new String[wordIds.size()]; wordIds.forEach((w, i) -> cqData[i] = w); - return new CompiledQuery<>(root, new CqData<>(cqData)); + + return root.newQuery(cqData); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java index e9972526..3f0cca50 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java @@ -8,6 +8,18 @@ import java.util.stream.Stream; * */ public sealed interface CqExpression { + /** Create a new query for the provided data using this expression as the root */ + default CompiledQuery newQuery(T[] data) { + return new CompiledQuery<>(this, data); + } + /** Create a new query for the provided data using this expression as the root */ + default CompiledQueryInt newQuery(int[] data) { + return new CompiledQueryInt(this, new CqDataInt(data)); + } + /** Create a new query for the provided data using this expression as the root */ + default CompiledQueryLong newQuery(long[] data) { + return new CompiledQueryLong(this, new CqDataLong(data)); + } Stream stream(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 7e8ca8ec..2ca45dca 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import it.unimi.dsi.fastutil.longs.LongSet; import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import java.util.ArrayList; @@ -36,7 +37,10 @@ public class CompiledQueryAggregates { public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } - + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index 621dff73..c9712ed4 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,6 +1,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; @@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { public CqIntMaxMinOperator(CompiledQueryLong query, 
LongToIntFunction operator) { this.operator = idx -> operator.applyAsInt(query.at(idx)); } - + public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index ffe02868..e33972c3 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -36,6 +36,10 @@ public class SearchQuery { @Deprecated // why does this exist? private double value = 0; + public static SearchQueryBuilder builder(String compiledQuery) { + return new SearchQueryBuilder(compiledQuery); + } + public SearchQuery() { this.compiledQuery = ""; this.searchTermsInclude = new ArrayList<>(); @@ -81,5 +85,45 @@ public class SearchQuery { return sb.toString(); } + public static class SearchQueryBuilder { + private final String compiledQuery; + private List searchTermsInclude = new ArrayList<>(); + private List searchTermsExclude = new ArrayList<>(); + private List searchTermsAdvice = new ArrayList<>(); + private List searchTermsPriority = new ArrayList<>(); + private List> searchTermCoherences = new ArrayList<>(); + private SearchQueryBuilder(String compiledQuery) { + this.compiledQuery = compiledQuery; + } + + public SearchQueryBuilder include(String... terms) { + searchTermsInclude.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder exclude(String... terms) { + searchTermsExclude.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder advice(String... 
terms) { + searchTermsAdvice.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder priority(String... terms) { + searchTermsPriority.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder coherences(String... coherences) { + searchTermCoherences.add(List.of(coherences)); + return this; + } + + public SearchQuery build() { + return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); + } + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index ad8b8cb1..f676a954 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -32,13 +32,11 @@ public class SearchResultItem implements Comparable { public SearchResultItem(long combinedId, long encodedDocMetadata, - int htmlFeatures, - boolean hasPrioTerm) { + int htmlFeatures) { this.combinedId = combinedId; this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); this.htmlFeatures = htmlFeatures; - this.hasPrioTerm = hasPrioTerm; } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 80cf502b..7c3704ba 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -83,8 +83,10 @@ public class ForwardIndexConverter { int ranking = domainRankings.getRanking(domainId); long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking); + long features = 
pointer.documentFeatures() | ((long) pointer.documentSize() << 32L); + docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); - docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures()); + docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features); } progress.progress(TaskSteps.FORCE); diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java index 5d26de82..f9393b45 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -82,9 +82,19 @@ public class ForwardIndexReader { long offset = idxForDoc(docId); if (offset < 0) return 0; - return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET); + return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) & 0xFFFF_FFFFL); } + public int getDocumentSize(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + + long offset = idxForDoc(docId); + if (offset < 0) return 0; + + return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) >>> 32L); + } + + private int idxForDoc(long docId) { assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java index b30f549f..5c02f648 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -79,6 +79,7 @@ class ForwardIndexConverterTest { writer.put( new IndexJournalEntryHeader(createId(id, id/20), id%3, + 15, (id % 5)), new 
IndexJournalEntryData( new String[]{}, diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java index b0f3d41e..82dc904a 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java @@ -17,14 +17,17 @@ import nu.marginalia.model.idx.DocumentMetadata; */ public record IndexJournalEntryHeader(int entrySize, int documentFeatures, + int documentSize, long combinedId, long documentMeta) { public IndexJournalEntryHeader(long combinedId, int documentFeatures, + int documentSize, long documentMeta) { this(-1, documentFeatures, + documentSize, combinedId, documentMeta); } diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java index 0f3a6ff2..aae65e81 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java @@ -28,12 +28,17 @@ public class IndexJournalReadEntry implements Iterable>> 48L); + final int docSize = (int) ((sizeBlock >>> 32L) & 0xFFFFL); + final int docFeatures = (int) (sizeBlock & 0xFFFF_FFFFL); final long docId = inputStream.readLong(); final long meta = inputStream.readLong(); + var header = new IndexJournalEntryHeader( - (int) (sizeBlock >>> 32L), - (int) (sizeBlock & 0xFFFF_FFFFL), + entrySize, + docFeatures, + docSize, docId, meta); @@ -57,6 +62,10 @@ public class IndexJournalReadEntry implements Iterable { public IndexJournalEntryTermData next() { // read the metadata for the term long termId = buffer.getLong(); - long meta = buffer.getLong(); + long meta = buffer.getShort(); // read the 
size of the sequence data int size = buffer.get() & 0xFF; diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java index 2f57da61..2dd8d0e9 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java @@ -13,7 +13,7 @@ public interface IndexJournalReader { int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; int DOCUMENT_HEADER_SIZE_BYTES = 24; - int TERM_HEADER_SIZE_BYTES = 17; + int TERM_HEADER_SIZE_BYTES = 11; /** Create a reader for a single file. */ static IndexJournalReader singleFile(Path fileName) throws IOException { diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java index 488d0dc6..d820f1e0 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java @@ -97,6 +97,9 @@ class SingleFileJournalPointer implements IndexJournalPointer { @Override public int documentFeatures() { return entry.documentFeatures(); } + @Override + public int documentSize() { return entry.documentSize(); } + /** Return an iterator over the terms in the current document. * This iterator is not valid after calling nextDocument(). 
*/ diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java index 59e65e27..68d21360 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java @@ -42,6 +42,8 @@ public interface IndexJournalPointer extends Iterable */ int documentFeatures(); + int documentSize(); + /** Concatenate a number of journal pointers */ static IndexJournalPointer concatenate(IndexJournalPointer... pointers) { if (pointers.length == 1) @@ -94,6 +96,11 @@ class JoiningJournalPointer implements IndexJournalPointer { return pointers[pIndex].documentFeatures(); } + @Override + public int documentSize() { + return pointers[pIndex].documentSize(); + } + @NotNull @Override public Iterator iterator() { @@ -146,6 +153,12 @@ class FilteringJournalPointer implements IndexJournalPointer { return base.documentFeatures(); } + + @Override + public int documentSize() { + return base.documentSize(); + } + @NotNull @Override public Iterator iterator() { diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java index df9b6836..916cf7a6 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java @@ -2,7 +2,6 @@ package nu.marginalia.index.journal.writer; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.model.IndexJournalEntryTermData; import java.io.IOException; diff --git 
a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java index 59999138..e5ddac52 100644 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java +++ b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java @@ -81,12 +81,6 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ public int put(IndexJournalEntryHeader header, IndexJournalEntryData data) { - if (dataBuffer.capacity() - dataBuffer.position() < 3*8) { - dataBuffer.flip(); - compressingStream.compress(dataBuffer); - dataBuffer.clear(); - } - final long[] keywords = data.termIds(); final long[] metadata = data.metadata(); final var positions = data.positions(); @@ -94,16 +88,30 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ int recordSize = 0; // document header size is 3 longs for (int i = 0; i < keywords.length; i++) { // term header size is 2 longs - recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size(); + recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize(); } - dataBuffer.putInt(recordSize); + if (recordSize > Short.MAX_VALUE) { + // This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file + // (32 KB is *a lot* of data for a single document, larger than the uncompressed HTML of most documents) + logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", recordSize, Short.MAX_VALUE); + return 0; + } + + if (dataBuffer.capacity() - dataBuffer.position() < 3*8) { + dataBuffer.flip(); + compressingStream.compress(dataBuffer); + dataBuffer.clear(); + } + + dataBuffer.putShort((short) recordSize); + dataBuffer.putShort((short) Math.clamp(0, 
header.documentSize(), Short.MAX_VALUE)); dataBuffer.putInt(header.documentFeatures()); dataBuffer.putLong(header.combinedId()); dataBuffer.putLong(header.documentMeta()); for (int i = 0; i < keywords.length; i++) { - int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size(); + int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize(); if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) { dataBuffer.flip(); @@ -112,8 +120,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ } dataBuffer.putLong(keywords[i]); - dataBuffer.putLong(metadata[i]); - dataBuffer.put((byte) positions[i].size()); + dataBuffer.putShort((short) metadata[i]); + dataBuffer.put((byte) positions[i].bufferSize()); dataBuffer.put(positions[i].buffer()); } diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java index b9cd49c1..84d72af3 100644 --- a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java +++ b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalWriterTest.java @@ -1,6 +1,8 @@ package nu.marginalia.index.journal; import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; @@ -8,6 +10,11 @@ import nu.marginalia.index.journal.model.IndexJournalEntryTermData; import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl; import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.UrlIdCodec; +import 
nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.sequence.GammaCodedSequence; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; @@ -18,8 +25,9 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Iterator; -import java.util.List; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.*; @@ -52,7 +60,7 @@ public class IndexJournalWriterTest { public void testSingleFile() { try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { // Write two documents with two terms each - writer.put(new IndexJournalEntryHeader(11, 22, 33), + writer.put(new IndexJournalEntryHeader(11, 22, 10, 33), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{44, 55}, @@ -61,7 +69,7 @@ public class IndexJournalWriterTest { gcs(2, 4, 6), }) ); - writer.put(new IndexJournalEntryHeader(12, 23, 34), + writer.put(new IndexJournalEntryHeader(12, 23, 11, 34), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{45, 56}, @@ -90,6 +98,7 @@ public class IndexJournalWriterTest { assertEquals(11, ptr.documentId()); assertEquals(22, ptr.documentFeatures()); assertEquals(33, ptr.documentMeta()); + assertEquals(10, ptr.documentSize()); iter = ptr.iterator(); @@ -116,6 +125,7 @@ public class IndexJournalWriterTest { assertEquals(12, ptr.documentId()); assertEquals(23, ptr.documentFeatures()); assertEquals(34, ptr.documentMeta()); + assertEquals(11, ptr.documentSize()); iter = ptr.iterator(); // Term 1 @@ -147,7 +157,7 @@ public class IndexJournalWriterTest { @Test public void testMultiFile() { try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { - writer.put(new IndexJournalEntryHeader(11, 22, 33), + writer.put(new IndexJournalEntryHeader(11, 22, 10, 33), 
new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{44, 55}, @@ -162,7 +172,7 @@ public class IndexJournalWriterTest { } try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) { - writer.put(new IndexJournalEntryHeader(12, 23, 34), + writer.put(new IndexJournalEntryHeader(12, 23, 11, 34), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{45, 56}, @@ -191,6 +201,7 @@ public class IndexJournalWriterTest { assertEquals(11, ptr.documentId()); assertEquals(22, ptr.documentFeatures()); assertEquals(33, ptr.documentMeta()); + assertEquals(10, ptr.documentSize()); iter = ptr.iterator(); @@ -217,6 +228,7 @@ public class IndexJournalWriterTest { assertEquals(12, ptr.documentId()); assertEquals(23, ptr.documentFeatures()); assertEquals(34, ptr.documentMeta()); + assertEquals(11, ptr.documentSize()); iter = ptr.iterator(); // Term 1 @@ -249,7 +261,7 @@ public class IndexJournalWriterTest { public void testSingleFileIterTwice() { try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { // Write two documents with two terms each - writer.put(new IndexJournalEntryHeader(11, 22, 33), + writer.put(new IndexJournalEntryHeader(11, 22, 10, 33), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{44, 55}, @@ -277,6 +289,7 @@ public class IndexJournalWriterTest { assertTrue(ptr.nextDocument()); assertEquals(11, ptr.documentId()); assertEquals(22, ptr.documentFeatures()); + assertEquals(10, ptr.documentSize()); assertEquals(33, ptr.documentMeta()); iter = ptr.iterator(); @@ -307,7 +320,7 @@ public class IndexJournalWriterTest { public void testFiltered() { try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { // Write two documents with two terms each - writer.put(new IndexJournalEntryHeader(11, 22, 33), + writer.put(new IndexJournalEntryHeader(11, 22, 10, 33), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{44, 55}, @@ -316,7 +329,7 @@ public class IndexJournalWriterTest { 
gcs(2, 4, 6), }) ); - writer.put(new IndexJournalEntryHeader(12, 23, 34), + writer.put(new IndexJournalEntryHeader(12, 23, 11, 34), new IndexJournalEntryData( new String[]{"word1", "word2"}, new long[]{45, 56}, @@ -344,6 +357,7 @@ public class IndexJournalWriterTest { assertEquals(12, ptr.documentId()); assertEquals(23, ptr.documentFeatures()); assertEquals(34, ptr.documentMeta()); + assertEquals(11, ptr.documentSize()); iter = ptr.iterator(); // Term 1 @@ -364,4 +378,72 @@ public class IndexJournalWriterTest { } } + @Test + public void testIntegrationScenario() throws IOException { + Map wordMap = new HashMap<>(); + for (int i = 0; i < 512; i++) { + wordMap.put(hasher.hashKeyword(Integer.toString(i)), i); + } + try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) { + for (int idc = 1; idc < 512; idc++) { + int id = idc; + int[] factors = IntStream + .rangeClosed(1, id) + .filter(v -> (id % v) == 0) + .toArray(); + + System.out.println("id:" + id + " factors: " + Arrays.toString(factors)); + + long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id); + + var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); + + String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); + long[] metadata = new long[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); + } + GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; + ByteBuffer wa = ByteBuffer.allocate(16); + for (int i = 0; i < factors.length; i++) { + positions[i] = GammaCodedSequence.generate(wa, i + 1); + } + + writer.put(header, new IndexJournalEntryData(keywords, metadata, positions)); + } + } + + try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) { + while (ptr.nextDocument()) { + int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId()); + 
System.out.println(ordinal); + + var expectedFactors = + new LongArrayList(IntStream + .rangeClosed(1, ordinal) + .filter(v -> (ordinal % v) == 0) + .mapToObj(Integer::toString) + .mapToLong(hasher::hashKeyword) + .toArray()); + + LongList foundIds = new LongArrayList(); + + var iter = ptr.iterator(); + while (iter.hasNext()) { + var termData = iter.next(); + foundIds.add(termData.termId()); + } + + if (!expectedFactors.equals(foundIds)) { + System.out.println("Found: "); + System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(","))); + System.out.println("Expected: "); + System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(","))); + fail(); + } + assertEquals(expectedFactors, foundIds); + } + } + } + } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index 72feb7fd..c7621427 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -3,6 +3,8 @@ package nu.marginalia.index; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.index.positions.PositionsFileReader; import nu.marginalia.index.query.EmptyEntrySource; import nu.marginalia.index.query.EntrySource; import nu.marginalia.index.query.ReverseIndexRejectFilter; @@ -14,9 +16,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.foreign.Arena; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; import java.util.concurrent.Executors; public class ReverseIndexReader { @@ -27,9 +29,16 @@ public class ReverseIndexReader { 
private final BTreeReader wordsBTreeReader; private final String name; - public ReverseIndexReader(String name, Path words, Path documents) throws IOException { + private final PositionsFileReader positionsFileReader; + + public ReverseIndexReader(String name, + Path words, + Path documents, + PositionsFileReader positionsFileReader) throws IOException { this.name = name; + this.positionsFileReader = positionsFileReader; + if (!Files.exists(words) || !Files.exists(documents)) { this.words = null; this.documents = null; @@ -133,31 +142,29 @@ public class ReverseIndexReader { offset); } - public long[] getTermMeta(long termId, long[] docIds) { + public TermData[] getTermData(Arena arena, + long termId, + long[] docIds) + { + var ret = new TermData[docIds.length]; + long offset = wordOffset(termId); if (offset < 0) { // This is likely a bug in the code, but we can't throw an exception here logger.debug("Missing offset for word {}", termId); - return new long[docIds.length]; + return ret; } - assert isUniqueAndSorted(docIds) : "The input array docIds is assumed to be unique and sorted, was " + Arrays.toString(docIds); - var reader = createReaderNew(offset); - return reader.queryData(docIds, 1); - } - private boolean isUniqueAndSorted(long[] ids) { - if (ids.length == 0) - return true; + // Read the size and offset of the position data + var offsets = reader.queryData(docIds, 1); - for (int i = 1; i < ids.length; i++) { - if(ids[i] <= ids[i-1]) - return false; + for (int i = 0; i < docIds.length; i++) { + ret[i] = positionsFileReader.getTermData(arena, offsets[i]); } - - return true; + return ret; } public void close() { @@ -166,5 +173,14 @@ public class ReverseIndexReader { if (words != null) words.close(); + + if (positionsFileReader != null) { + try { + positionsFileReader.close(); + } catch (IOException e) { + logger.error("Failed to close positions file reader", e); + } + } } + } diff --git 
a/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java index 80225e06..9cbd6b14 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java @@ -1,5 +1,6 @@ package nu.marginalia.index.construction; +import nu.marginalia.index.positions.PositionCodec; import nu.marginalia.sequence.GammaCodedSequence; import java.io.IOException; @@ -38,7 +39,7 @@ public class PositionsFileConstructor implements AutoCloseable { /** Add a term to the positions file * @param termMeta the term metadata * @param positions the positions of the term - * @return the offset of the term in the file + * @return the offset of the term in the file, with the size of the data in the highest 16 bits */ public long add(byte termMeta, GammaCodedSequence positions) throws IOException { synchronized (file) { @@ -53,12 +54,20 @@ public class PositionsFileConstructor implements AutoCloseable { workBuffer.put(termMeta); workBuffer.put(positionBuffer); + long ret = PositionCodec.encode(size, offset); + offset += size; - return offset; + + return ret; } } public void close() throws IOException { + while (workBuffer.position() < workBuffer.limit()) { + workBuffer.flip(); + channel.write(workBuffer); + } + channel.force(false); channel.close(); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java index 8ea5b491..9fa3ed93 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java @@ -7,7 +7,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory;
import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.util.concurrent.atomic.AtomicInteger; diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java index aa4fc98e..3f97061a 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java @@ -21,12 +21,14 @@ import java.util.concurrent.TimeUnit; * the associated ReversePreindexWordSegments data */ public class ReversePreindexDocuments { + public final LongArray documents; + private static PositionsFileConstructor positionsFileConstructor; - final Path file; - public final LongArray documents; private static final int RECORD_SIZE_LONGS = 2; private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class); + public final Path file; + public ReversePreindexDocuments(LongArray documents, Path file) { this.documents = documents; this.file = file; @@ -70,22 +72,25 @@ public class ReversePreindexDocuments { long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); - try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) { + try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs); + var pointer = reader.newPointer()) + { var offsetMap = segments.asMap(RECORD_SIZE_LONGS); offsetMap.defaultReturnValue(0); - var pointer = reader.newPointer(); while (pointer.nextDocument()) { long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); for (var termData : pointer) { long termId = termData.termId(); long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS); - long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions()); + + // write position data to the positions file and get the 
offset + long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions()); assembly.put(offset + 0, rankEncodedId); - assembly.put(offset + 1, posOffset); + assembly.put(offset + 1, encodedPosOffset); } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java new file mode 100644 index 00000000..9df63eec --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java @@ -0,0 +1,25 @@ +package nu.marginalia.index.positions; + +/** A utility class for encoding and decoding position data offsets, + * the data is encoded by using the highest 16 bits to store the size, + * and the remaining 48 bits to store the offset of the data. + *

+ * This lets us address 256 TB of data, with up to 64 KB of position data for each term, + * which is ample headroom for both the size of the data and the number of positions. + * */ +public class PositionCodec { + + public static long encode(int length, long offset) { + assert decodeSize(offset) == 0 : "Offset must be less than 2^48"; + + return (long) length << 48 | offset; + } + + public static int decodeSize(long sizeEncodedOffset) { + return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48); + } + public static long decodeOffset(long sizeEncodedOffset) { + return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL; + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java new file mode 100644 index 00000000..647b205e --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java @@ -0,0 +1,39 @@ +package nu.marginalia.index.positions; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class PositionsFileReader implements AutoCloseable { + private final FileChannel positions; + + public PositionsFileReader(Path positionsFile) throws IOException { + this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ); + } + + /** Get the positions for a term in the index, as pointed out by the encoded offset; + * intermediate buffers are allocated from the provided arena allocator. 
*/ + public TermData getTermData(Arena arena, long sizeEncodedOffset) { + int length = PositionCodec.decodeSize(sizeEncodedOffset); + long offset = PositionCodec.decodeOffset(sizeEncodedOffset); + + var segment = arena.allocate(length); + var buffer = segment.asByteBuffer(); + + try { + positions.read(buffer, offset); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return new TermData(buffer); + } + + @Override + public void close() throws IOException { + positions.close(); + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java new file mode 100644 index 00000000..55458342 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java @@ -0,0 +1,21 @@ +package nu.marginalia.index.positions; + +import nu.marginalia.sequence.GammaCodedSequence; + +import java.nio.ByteBuffer; + +public class TermData { + private final ByteBuffer buffer; + + public TermData(ByteBuffer buffer) { + this.buffer = buffer; + } + + public byte flags() { + return buffer.get(0); + } + + public GammaCodedSequence positions() { + return new GammaCodedSequence(buffer, 1, buffer.capacity()); + } +} diff --git a/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java new file mode 100644 index 00000000..5dd2be3a --- /dev/null +++ b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.index; + +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.index.positions.PositionsFileReader; +import nu.marginalia.sequence.GammaCodedSequence; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import 
org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class PositionsFileReaderTest { + + Path file; + + @BeforeEach + void setUp() throws IOException { + file = Files.createTempFile("positions", "dat"); + } + @AfterEach + void tearDown() throws IOException { + Files.delete(file); + } + + @Test + void getTermData() throws IOException { + ByteBuffer workArea = ByteBuffer.allocate(8192); + long key1, key2, key3; + try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) { + key1 = constructor.add((byte) 43, GammaCodedSequence.generate(workArea, 1, 2, 3)); + key2 = constructor.add((byte) 51, GammaCodedSequence.generate(workArea, 2, 3, 5, 1000, 5000, 20241)); + key3 = constructor.add((byte) 61, GammaCodedSequence.generate(workArea, 3, 5, 7)); + } + + System.out.println("key1: " + Long.toHexString(key1)); + System.out.println("key2: " + Long.toHexString(key2)); + System.out.println("key3: " + Long.toHexString(key3)); + + try (Arena arena = Arena.ofConfined(); + PositionsFileReader reader = new PositionsFileReader(file)) + { + TermData data1 = reader.getTermData(arena, key1); + assertEquals(43, data1.flags()); + assertEquals(IntList.of( 1, 2, 3), data1.positions().values()); + + TermData data2 = reader.getTermData(arena, key2); + assertEquals(51, data2.flags()); + assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values()); + + TermData data3 = reader.getTermData(arena, key3); + assertEquals(61, data3.flags()); + assertEquals(IntList.of(3, 5, 7), data3.positions().values()); + } + } +} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java index 981136ad..2d53dd2e 100644 --- 
a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java @@ -1,17 +1,19 @@ package nu.marginalia.index; +import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.ReversePreindex; import nu.marginalia.index.construction.TestJournalFactory; import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta; +import nu.marginalia.index.positions.PositionsFileReader; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import java.io.IOException; +import java.lang.foreign.Arena; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -47,13 +49,18 @@ class ReverseIndexReaderTest { public void testSimple() throws IOException { var indexReader = createIndex( - new EntryDataWithWordMeta(100, 101, wm(50, 51)) + new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5)) ); assertEquals(1, indexReader.numDocuments(50)); - long[] meta = indexReader.getTermMeta(50, new long[] { 100 }); - assertArrayEquals(new long[] { 51 }, meta); + var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 }); + + assertEquals(1, positions.length); + assertNotNull(positions[0]); + assertEquals((byte) 51, positions[0].flags()); + assertEquals(IntList.of(1, 3, 5), positions[0].positions().values()); + assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50)); } @@ -69,13 +76,8 @@ class ReverseIndexReaderTest { assertEquals(2, indexReader.numDocuments(51)); assertEquals(1, indexReader.numDocuments(52)); - assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 })); assertArrayEquals(new long[] { 100 }, 
readEntries(indexReader, 50)); - - assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 })); assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51)); - - assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 })); assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52)); } @@ -91,18 +93,20 @@ class ReverseIndexReaderTest { private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { var reader = journalFactory.createReader(scenario); - var preindex = ReversePreindex.constructPreindex(reader, - Mockito.mock(PositionsFileConstructor.class), - DocIdRewriter.identity(), tempDir); - + Path posFile = tempDir.resolve("positions.dat"); Path docsFile = tempDir.resolve("docs.dat"); Path wordsFile = tempDir.resolve("words.dat"); - preindex.finalizeIndex(docsFile, wordsFile); - preindex.delete(); + try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) { + var preindex = ReversePreindex.constructPreindex(reader, + positionsFileConstructor, + DocIdRewriter.identity(), tempDir); + preindex.finalizeIndex(docsFile, wordsFile); + preindex.delete(); + } - return new ReverseIndexReader("test", wordsFile, docsFile); + return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile)); } } \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java index e12dbad6..df378228 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java @@ -155,15 +155,15 @@ class ReversePreindexDocsTest { if (wordId != that.wordId) return false; if (start != that.start) return false; if (end != that.end) return false; - return 
Arrays.equals(data, that.data); + return data[0] == that.data[0]; //Arrays.equals(data, that.data); } @Override public int hashCode() { - int result = (int) (wordId ^ (wordId >>> 32)); - result = 31 * result + (int) (start ^ (start >>> 32)); - result = 31 * result + (int) (end ^ (end >>> 32)); - result = 31 * result + Arrays.hashCode(data); + int result = Long.hashCode(wordId); + result = 31 * result + Long.hashCode(start); + result = 31 * result + Long.hashCode(end); + result = 31 * result + Long.hashCode(data[0]); return result; } diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java index d9f3cddc..e10c2c27 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java @@ -79,9 +79,7 @@ class ReversePreindexFinalizeTest { assertEquals(1, wordsHeader.numEntries()); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1)); assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); } @@ -122,9 +120,7 @@ class ReversePreindexFinalizeTest { long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3); assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); BTreeHeader docsHeader; @@ -133,13 +129,11 @@ class ReversePreindexFinalizeTest { assertEquals(1, docsHeader.numEntries()); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() 
+ 1)); docsHeader = new BTreeHeader(docsArray, offset2); System.out.println(docsHeader); assertEquals(1, docsHeader.numEntries()); assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1)); } } \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java index db262d9f..a4c15305 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java @@ -8,11 +8,13 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; import nu.marginalia.sequence.GammaCodedSequence; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Objects; public class TestJournalFactory { Path tempDir = Files.createTempDirectory("journal"); @@ -50,10 +52,10 @@ public class TestJournalFactory { '}'; } } - public record WordWithMeta(long wordId, long meta) {} + public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {} - public static WordWithMeta wm(long wordId, long meta) { - return new WordWithMeta(wordId, meta); + public static WordWithMeta wm(long wordId, long meta, int... positions) { + return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions)); } IndexJournalReader createReader(EntryData... 
entries) throws IOException { @@ -71,7 +73,7 @@ public class TestJournalFactory { positions[i] = new GammaCodedSequence(new byte[1]); } - writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), + writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), new IndexJournalEntryData(termIds, meta, positions)); } writer.close(); @@ -91,10 +93,10 @@ public class TestJournalFactory { for (int i = 0; i < entry.wordIds.length; i++) { termIds[i] = entry.wordIds[i].wordId; meta[i] = entry.wordIds[i].meta; - positions[i] = new GammaCodedSequence(new byte[1]); + positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1])); } - writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), + writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta), new IndexJournalEntryData(termIds, meta, positions)); } writer.close(); diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index a1d2f5a5..38fed31e 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -4,11 +4,10 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.IndexLocations; import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.positions.PositionsFileReader; import nu.marginalia.storage.FileStorageService; import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; @@ -40,17 +39,18 @@ public class IndexFactory { } public ReverseIndexReader getReverseIndexReader() throws IOException { - return new ReverseIndexReader("full", ReverseIndexFullFileNames.resolve(liveStorage, 
ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), - ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT) + ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT), + new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT)) ); } public ReverseIndexReader getReverseIndexPrioReader() throws IOException { return new ReverseIndexReader("prio", ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), - ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) + ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT), + null ); } diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 1c430014..ec78890c 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -281,10 +281,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { awaitCompletion(); // Return the best results - return new SearchResultSet( - resultValuator.selectBestResults(parameters, - resultRankingContext, - resultHeap)); + return new SearchResultSet(resultValuator.selectBestResults(parameters, resultHeap)); } /** Wait for all tasks to complete */ diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index afc52094..5779b526 100644 --- 
a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -14,12 +14,13 @@ import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.DocMetadataList; +import nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.lang.foreign.Arena; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; @@ -169,8 +170,11 @@ public class CombinedIndexReader { } /** Retrieves the term metadata for the specified word for the provided documents */ - public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) { - return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array())); + public TermMetadataList getTermMetadata(Arena arena, + long wordId, + CombinedDocIdList docIds) + { + return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array())); } /** Retrieves the document metadata for the specified document */ @@ -186,8 +190,12 @@ public class CombinedIndexReader { /** Retrieves the HTML features for the specified document */ public int getHtmlFeatures(long docId) { return forwardIndexReader.getHtmlFeatures(docId); + } /** Retrieves the HTML features for the specified document */ + public int getDocumentSize(long docId) { + return forwardIndexReader.getDocumentSize(docId); } + /** Close the indexes (this is not done immediately) * */ public void close() throws InterruptedException { diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java 
b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index d068c0f4..4ee34b42 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -10,12 +10,13 @@ import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.QuerySearchTerms; import nu.marginalia.index.results.model.TermCoherenceGroupList; -import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.index.results.model.ids.TermIdList; +import java.lang.foreign.Arena; + import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; -import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; public class IndexMetadataService { private final StatefulIndex statefulIndex; @@ -25,22 +26,19 @@ public class IndexMetadataService { this.statefulIndex = index; } - public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll, - TermIdList termIdsList) + public Long2ObjectArrayMap + getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList) { var currentIndex = statefulIndex.get(); - Long2ObjectArrayMap termdocToMeta = + Long2ObjectArrayMap termdocToMeta = new Long2ObjectArrayMap<>(termIdsList.size()); for (long termId : termIdsList.array()) { - var metadata = currentIndex.getMetadata(termId, combinedIdsAll); - - termdocToMeta.put(termId, - new DocumentsWithMetadata(combinedIdsAll, metadata)); + termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll)); } - return new TermMetadataForCombinedDocumentIds(termdocToMeta); + return termdocToMeta; } public QuerySearchTerms 
getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 0fc4bdc1..3972c272 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,25 +1,22 @@ package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.compiled.*; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.results.model.QuerySearchTerms; -import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.ranking.results.ResultValuator; +import nu.marginalia.sequence.GammaCodedSequence; import javax.annotation.Nullable; -import java.util.List; + +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; /** This class is responsible for calculating the score of a search result. 
* It holds the data required to perform the scoring, as there is strong @@ -28,94 +25,74 @@ public class IndexResultValuationContext { private final CombinedIndexReader index; private final QueryParams queryParams; - private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; - private final QuerySearchTerms searchTerms; - private final ResultRankingContext rankingContext; private final ResultValuator searchResultValuator; private final CompiledQuery compiledQuery; - private final CompiledQueryLong compiledQueryIds; - public IndexResultValuationContext(IndexMetadataService metadataService, - ResultValuator searchResultValuator, - CombinedDocIdList ids, + public IndexResultValuationContext(ResultValuator searchResultValuator, StatefulIndex statefulIndex, ResultRankingContext rankingContext, - SearchParameters params - ) { + SearchParameters params) + { this.index = statefulIndex.get(); this.rankingContext = rankingContext; this.searchResultValuator = searchResultValuator; this.queryParams = params.queryParams; this.compiledQuery = params.compiledQuery; - this.compiledQueryIds = params.compiledQueryIds; - - this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - - this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, - searchTerms.termIdsAll); } - private final long flagsFilterMask = - WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); + private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); @Nullable - public SearchResultItem calculatePreliminaryScore(long combinedId) { + public SearchResultItem calculatePreliminaryScore(long combinedId, + QuerySearchTerms searchTerms, + long[] wordFlags, + GammaCodedSequence[] positions) + { + + + // FIXME: Reconsider 
coherence logic with the new position data +// if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId)) +// return null; + + CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions); + CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); + int[] counts = new int[compiledQuery.size()]; + for (int i = 0; i < counts.length; i++) { + if (positions[i] != null) { + counts[i] = positions[i].valueCount(); + } + } + CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts); + + // If the document is not relevant to the query, abort early to reduce allocations and + // avoid unnecessary calculations + if (testRelevance(wordFlagsQuery, positionsCountQuery)) { + return null; + } + long docId = UrlIdCodec.removeRank(combinedId); - - if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId)) - return null; - long docMetadata = index.getDocumentMetadata(docId); int htmlFeatures = index.getHtmlFeatures(docId); - - SearchResultItem searchResult = new SearchResultItem(docId, - docMetadata, - htmlFeatures, - hasPrioTerm(combinedId)); - - long[] wordMetas = new long[compiledQuery.size()]; - SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; - - for (int i = 0; i < wordMetas.length; i++) { - final long termId = compiledQueryIds.at(i); - final String term = compiledQuery.at(i); - - wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); - scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); - } - - - // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs - // to be able to re-construct its own CompiledQuery for re-ranking the results. This is - // a very flimsy assumption. 
- searchResult.keywordScores.addAll(List.of(scores)); - - CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - - - boolean allSynthetic = CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isPresent); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - - if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { - return null; - } - - if (flagsCount == 0 && !allSynthetic && positionsCount == 0) - return null; + int docSize = index.getDocumentSize(docId); double score = searchResultValuator.calculateSearchResultValue( - wordMetasQuery, + wordFlagsQuery, + positionsCountQuery, + positionsQuery, docMetadata, htmlFeatures, - 5000, // use a dummy value here as it's not present in the index + docSize, rankingContext, null); - if (searchResult.hasPrioTerm) { + SearchResultItem searchResult = new SearchResultItem(docId, + docMetadata, + htmlFeatures); + + if (hasPrioTerm(searchTerms, positions)) { score = 0.75 * score; } @@ -124,13 +101,32 @@ public class IndexResultValuationContext { return searchResult; } - private boolean hasPrioTerm(long combinedId) { - for (var term : searchTerms.termIdsPrio.array()) { - if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) { + private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) { + boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent); + int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask)); + int positionsCount = intMaxMinAggregate(countsQuery, p -> p); + + if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { + return true; + } + if 
(flagsCount == 0 && !allSynthetic && positionsCount == 0) { + return true; + } + + return false; + } + + private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) { + var allTerms = searchTerms.termIdsAll; + var prioTerms = searchTerms.termIdsPrio; + + for (int i = 0; i < allTerms.size(); i++) { + if (positions[i] != null && prioTerms.contains(allTerms.at(i))) { return true; } } - return false; + + return false; } private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, @@ -142,7 +138,7 @@ public class IndexResultValuationContext { return true; } - return CompiledQueryAggregates.booleanAggregate(queryGraphScores, + return booleanAggregate(queryGraphScores, docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index baecb564..fbe99cb1 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -7,8 +7,6 @@ import gnu.trove.list.array.TLongArrayList; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; @@ -21,12 +19,13 @@ import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.results.ResultValuator; +import 
nu.marginalia.sequence.GammaCodedSequence; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.lang.foreign.Arena; import java.sql.SQLException; import java.util.*; -import java.util.function.Consumer; @Singleton public class IndexResultValuatorService { @@ -53,35 +52,53 @@ public class IndexResultValuatorService { ResultRankingContext rankingContext, CombinedDocIdList resultIds) { - final var evaluator = createValuationContext(params, rankingContext, resultIds); + IndexResultValuationContext evaluator = + new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params); List results = new ArrayList<>(resultIds.size()); - for (long id : resultIds.array()) { - var score = evaluator.calculatePreliminaryScore(id); - if (score != null) { - results.add(score); + try (var arena = Arena.ofConfined()) { + // Batch-fetch the word metadata for the documents + + var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); + var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll); + + // Prepare data for the document. We do this outside of the calculation function to avoid + // hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there; + // out here we can rely on implicit array ordering to match up the data. 
+ + var ra = resultIds.array(); + long[] flags = new long[searchTerms.termIdsAll.size()]; + GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()]; + + for (int i = 0; i < ra.length; i++) { + long id = ra[i]; + + // Prepare term-level data for the document + for (int ti = 0; ti < flags.length; ti++) { + long tid = searchTerms.termIdsAll.at(ti); + var tfd = termsForDocs.get(tid); + + assert tfd != null : "No term data for term " + ti; + + flags[ti] = tfd.flag(i); + positions[ti] = tfd.position(i); + } + + // Calculate the preliminary score + + var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions); + if (score != null) { + results.add(score); + } } + + return results; } - - return results; - } - - private IndexResultValuationContext createValuationContext(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) - { - return new IndexResultValuationContext(metadataService, - resultValuator, - resultIds, - statefulIndex, - rankingContext, - params); } public List selectBestResults(SearchParameters params, - ResultRankingContext rankingContext, Collection results) throws SQLException { var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); @@ -101,14 +118,13 @@ public class IndexResultValuatorService { item.resultsFromDomain = domainCountFilter.getCount(item); } - return decorateAndRerank(resultsList, params.compiledQuery, rankingContext); + return decorateResults(resultsList, params.compiledQuery); } /** Decorate the result items with additional information from the link database * and calculate an updated ranking with the additional information */ - public List decorateAndRerank(List rawResults, - CompiledQuery compiledQuery, - ResultRankingContext rankingContext) + public List decorateResults(List rawResults, + CompiledQuery compiledQuery) throws SQLException { TLongList idsList = new TLongArrayList(rawResults.size()); @@ -131,42 +147,18 @@ 
public class IndexResultValuatorService { continue; } - // Reconstruct the compiledquery for re-valuation - // - // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same - // order as the data for the CompiledQuery. - long[] wordMetas = new long[compiledQuery.size()]; - - for (int i = 0; i < compiledQuery.size(); i++) { - var score = result.keywordScores.get(i); - wordMetas[i] = score.encodedWordMetadata(); - } - - CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - resultItems.add(createCombinedItem( result, - docData, - metaQuery, - rankingContext)); + docData)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, - DocdbUrlDetail docData, - CompiledQueryLong wordMetas, - ResultRankingContext rankingContext) { + DocdbUrlDetail docData) { ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor(); - Consumer detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null; - - double score = resultValuator.calculateSearchResultValue(wordMetas, - result.encodedDocMetadata, - result.htmlFeatures, - docData.wordsTotal(), - rankingContext, - detailConsumer); + // Consumer detailConsumer = rankingContext.params.exportDebugData ? 
detailsExtractor::set : null; return new DecoratedSearchResultItem( result, @@ -179,8 +171,8 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - bestPositions(wordMetas), - score, + 0L, //bestPositions(wordMetas), + result.getScore(), detailsExtractor.get() ); } diff --git a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java index 3ef2f7ab..20069a55 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java +++ b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java @@ -1,26 +1,38 @@ package nu.marginalia.index.results.model; -import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; +import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; +import nu.marginalia.index.positions.TermData; import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.DocMetadataList; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import nu.marginalia.index.results.model.ids.TermMetadataList; +import nu.marginalia.sequence.GammaCodedSequence; + +import javax.annotation.Nullable; public class TermMetadataForCombinedDocumentIds { - private static final Logger logger = LoggerFactory.getLogger(TermMetadataForCombinedDocumentIds.class); private final Long2ObjectArrayMap termdocToMeta; public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap termdocToMeta) { this.termdocToMeta = termdocToMeta; } - public long getTermMetadata(long termId, long combinedId) { + public byte getTermMetadata(long termId, long combinedId) { var metaByCombinedId = termdocToMeta.get(termId); if (metaByCombinedId == null) { return 0; } - return metaByCombinedId.get(combinedId); + return metaByCombinedId.get(combinedId).flags(); + } + 
+ @Nullable + public GammaCodedSequence getPositions(long termId, long combinedId) { + var metaByCombinedId = termdocToMeta.get(termId); + + if (metaByCombinedId == null) { + return null; + } + + return metaByCombinedId.get(combinedId).positions(); } public boolean hasTermMeta(long termId, long combinedId) { @@ -30,16 +42,25 @@ public class TermMetadataForCombinedDocumentIds { return false; } - return metaByCombinedId.get(combinedId) != 0; + return metaByCombinedId.data().containsKey(combinedId); } - public record DocumentsWithMetadata(Long2LongOpenHashMap data) { - public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) { - this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array())); + public record DocumentsWithMetadata(Long2ObjectOpenHashMap data) { + public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) { + this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size())); + + long[] ids = combinedDocIdsAll.array(); + TermData[] data = metadata.array(); + + for (int i = 0; i < combinedDocIdsAll.size(); i++) { + if (data[i] != null) { + this.data.put(ids[i], data[i]); + } + } } - public long get(long combinedId) { - return data.getOrDefault(combinedId, 0); + public TermData get(long combinedId) { + return data.get(combinedId); } } } diff --git a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java index 17bd17a1..7845f14f 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java @@ -15,6 +15,10 @@ import java.util.stream.LongStream; public final class CombinedDocIdList { private final long[] data; + public CombinedDocIdList(long... 
data) { + this.data = Arrays.copyOf(data, data.length); + } + public CombinedDocIdList(LongArrayList data) { this.data = data.toLongArray(); } diff --git a/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java b/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java deleted file mode 100644 index 0104f89c..00000000 --- a/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.index.results.model.ids; - -import it.unimi.dsi.fastutil.longs.LongArrayList; - -import java.util.Arrays; -import java.util.Objects; -import java.util.stream.LongStream; - -public final class DocMetadataList { - private final long[] array; - - public DocMetadataList(long[] array) { - this.array = array; - } - - public DocMetadataList(LongArrayList list) { - this(list.toLongArray()); - } - - public int size() { - return array.length; - } - - public LongStream stream() { - return LongStream.of(array); - } - - public long[] array() { - return array; - } - - @Override - public boolean equals(Object obj) { - if (obj == this) return true; - if (obj == null || obj.getClass() != this.getClass()) return false; - var that = (DocMetadataList) obj; - return Arrays.equals(this.array, that.array); - } - - @Override - public int hashCode() { - return Arrays.hashCode(array); - } - -} diff --git a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java index f25ab1b9..903fef9f 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java @@ -11,6 +11,7 @@ public final class TermIdList { public TermIdList(long[] array) { this.array = array; + Arrays.sort(this.array); } public TermIdList(LongArrayList list) { @@ -29,6 +30,15 @@ public final class TermIdList { return array; } + public long at(int i) { + return array[i]; + 
} + + public boolean contains(long id) { + // Implicitly sorted + return Arrays.binarySearch(array, id) >= 0; + } + @Override public boolean equals(Object obj) { if (obj == this) return true; diff --git a/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java b/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java new file mode 100644 index 00000000..dd7ebbcb --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java @@ -0,0 +1,55 @@ +package nu.marginalia.index.results.model.ids; + +import nu.marginalia.index.positions.TermData; +import nu.marginalia.sequence.GammaCodedSequence; + +import javax.annotation.Nullable; +import java.util.Arrays; + +public final class TermMetadataList { + private final TermData[] array; + + public TermMetadataList(TermData[] array) { + this.array = array; + } + + public int size() { + return array.length; + } + + public long flag(int i) { + if (array[i] == null) + return 0; + + return array[i].flags(); + } + + /** Returns the position data for the given document index, + * may be null if the term is not in the document + */ + @Nullable + public GammaCodedSequence position(int i) { + if (array[i] == null) + return null; + + return array[i].positions(); + } + + public TermData[] array() { + return array; + } + + @Override + public boolean equals(Object obj) { + if (obj == this) return true; + if (obj == null || obj.getClass() != this.getClass()) return false; + var that = (TermMetadataList) obj; + return Arrays.equals(this.array, that.array); + } + + @Override + public int hashCode() { + return Arrays.hashCode(array); + } + +} diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 1e026b40..ae84a11e 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,5 +1,7 @@ 
package nu.marginalia.ranking.results; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; @@ -14,6 +16,7 @@ import nu.marginalia.ranking.results.factors.*; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.sequence.GammaCodedSequence; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,15 +36,15 @@ public class ResultValuator { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(CompiledQueryLong wordMeta, - long documentMetadata, + public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, + CompiledQueryInt positionsCountQuery, CompiledQuery positionsQuery, long documentMetadata, int features, int length, ResultRankingContext ctx, @Nullable Consumer detailsConsumer ) { - if (wordMeta.isEmpty()) + if (wordFlagsQuery.isEmpty()) return Double.MAX_VALUE; if (length < 0) { @@ -82,12 +85,11 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta); - double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); + // FIXME: need a weighting factor here + double tcfAvgDist = 25. 
/ termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx); - double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); + double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx)); + double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); @@ -112,10 +114,10 @@ public class ResultValuator { temporalBias, flagsPenalty, overallPart, - tcfOverlap, - tcfJaccard, + 0, + 0, bM25F, - bM25N, + 0, // FIXME: Remove from model bM25P) ); @@ -125,8 +127,8 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function double ret = normalize( - tcfOverlap + tcfJaccard - + bM25F + bM25P + bM25N + tcfAvgDist + + bM25F + bM25P + overallPartPositive, overallPartNegative); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java index 4105ed6b..88a592bb 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -13,7 +13,7 @@ import java.util.List; public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { private static final long 
AVG_LENGTH = 5000; - private final CqDataLong wordMetaData; + private final CqDataInt counts; private final CqDataInt frequencies; private final Bm25Parameters bm25Parameters; @@ -22,31 +22,16 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { private final BitSet mask; - private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, + public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + CqDataInt counts, int length, - BitSet mask, ResultRankingContext ctx) { this.length = length; this.bm25Parameters = bm25Parameters; this.docCount = ctx.termFreqDocCount(); - this.wordMetaData = wordMetaData; + this.counts = counts; this.frequencies = ctx.fullCounts; - this.mask = mask; - } - - public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - int length, - ResultRankingContext ctx) { - return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx); - } - - public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - int length, - ResultRankingContext ctx) { - return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx); + this.mask = ctx.regularMask; } @Override @@ -73,7 +58,7 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { return 0; } - double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); + double count = counts.get(idx); int freq = frequencies.get(idx); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index 3bda0580..2ebef7cd 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,66 +1,44 @@ package nu.marginalia.ranking.results.factors; -import 
nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.sequence.SequenceOperations; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - /** Calculate a factor that rewards the best total position overlap - * between the terms in the query. This is high when all the terms - * found in the same sentences. - */ - public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { - if (wordMetadataQuery.size() < 2) - return 0; - - long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, - score -> score >>> WordMetadata.POSITIONS_SHIFT); - - return bitsSetFactor(mask); - } - - /** Calculate a factor that rewards the best average mutual Jaccard index - * between the terms in the query. This is high when the several terms are frequently - * found in the same sentences. 
- */ - public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) { + public double calculateAvgMinDistance(CompiledQuery positions, ResultRankingContext ctx) { double sum = 0; int cnt = 0; - for (int i = 0; i < wordMetadataQuery.size(); i++) { + for (int i = 0; i < positions.size(); i++) { // Skip terms that are not in the regular mask if (!ctx.regularMask.get(i)) continue; - long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i)); + var posi = positions.at(i); // Skip terms that are not in the document - if (imask == 0L) + if (posi == null) continue; - for (int j = i + 1; j < wordMetadataQuery.size(); j++) { + for (int j = i + 1; j < positions.size(); j++) { // Skip terms that are not in the regular mask if (!ctx.regularMask.get(j)) continue; - long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j)); + var posj = positions.at(j); // Skip terms that are not in the document - if (jmask == 0L) + if (posj == null) continue; - long quot = Long.bitCount(imask & jmask); - long rem = Long.bitCount(imask | jmask); - - // rem is always > 0 because imask and jmask are not both 0 - - sum += quot/(double) rem; + int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator()); + sum += distance; cnt++; } } @@ -68,15 +46,8 @@ public class TermCoherenceFactor { if (cnt > 0) { return sum / cnt; } else { - return 0; + return 1000.; } } - double bitsSetFactor(long mask) { - final int bitsSetInMask = Long.bitCount(mask); - - return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25); - } - - } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java new file mode 100644 index 00000000..cd23261e --- /dev/null +++ b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java @@ -0,0 +1,382 @@ +package nu.marginalia.index; + +import com.google.inject.Guice; +import 
com.google.inject.Inject; +import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongList; +import nu.marginalia.IndexLocations; +import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.ReverseIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.model.IndexJournalEntryData; +import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbWriter; +import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.service.control.ServiceHeartbeat; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; + +import 
java.io.IOException; +import java.lang.foreign.Arena; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; + +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Execution(SAME_THREAD) +public class CombinedIndexReaderTest { + + @Inject + Initialization initialization; + + IndexQueryServiceIntegrationTestModule testModule; + + @Inject + StatefulIndex statefulIndex; + + @Inject + IndexJournalWriter indexJournalWriter; + + @Inject + FileStorageService fileStorageService; + + @Inject + DomainRankings domainRankings; + + @Inject + ProcessHeartbeat processHeartbeat; + @Inject + DocumentDbReader documentDbReader; + + @Inject + IndexFactory indexFactory; + + @BeforeEach + public void setUp() throws IOException { + + testModule = new IndexQueryServiceIntegrationTestModule(); + Guice.createInjector(testModule).injectMembers(this); + + initialization.setReady(); + } + + @AfterEach + public void tearDown() throws IOException { + testModule.cleanUp(); + } + + private final MockDocumentMeta anyMetadata = new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))); + + @Test + public void testSimpleRetrieval() throws Exception { + new MockData().add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title, 33, 55), + w("world", WordFlags.Subjects, 34) + ).load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader.findFullWord(kw("hello")).build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(1, 1)), + decode(buffer) + ); + + var helloMeta = td(reader, kw("hello"), d(1, 1)); + assertEquals(helloMeta.flags(), WordFlags.Title.asBit()); + assertEquals(IntList.of(33, 55), 
helloMeta.positions().values()); + + var worldMeta = td(reader, kw("world"), d(1, 1)); + assertEquals(worldMeta.flags(), WordFlags.Subjects.asBit()); + assertEquals(IntList.of(34), worldMeta.positions().values()); + } + + TermData td(CombinedIndexReader reader, long wordId, MockDataDocument docId) { + return (reader.getTermMetadata(Arena.global(), wordId, new CombinedDocIdList(docId.docId())).array())[0]; + } + + + @Test + public void testUnionRetrieval() throws Exception { + new MockData() + .add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .add( + d(1, 2), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(1, 3), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(2, 4), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader + .findFullWord(kw("hello")) + .also(kw("world")) + .build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(1, 1), d(2, 4)), + decode(buffer) + ); + } + + @Test + public void testNotFilterRetrieval() throws Exception { + new MockData() + .add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title), + w("goodbye", WordFlags.Title) + ) + .add( + d(1, 2), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(1, 3), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(2, 4), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader.findFullWord(kw("hello")) + .also(kw("world")) + .not(kw("goodbye")) + .build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(2, 4)), + decode(buffer) + ); + } + + List decode(LongQueryBuffer buffer) { + List result = new ArrayList<>(); + for (int i = 0; i < buffer.size(); 
i++) { + result.add(new MockDataDocument(buffer.data.get(i))); + } + return result; + } + + private MockDataDocument d(int domainId, int ordinal) { + return new MockDataDocument(domainId, ordinal); + } + + private void constructIndex() throws IOException { + createForwardIndex(); + createFullReverseIndex(); + createPrioReverseIndex(); + } + + private void createFullReverseIndex() throws IOException { + + Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT); + + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); + Path tmpDir = workDir.resolve("tmp"); + + if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + var constructor = + new ReverseIndexConstructor( + outputFileDocs, + outputFileWords, + outputFilePositions, + IndexJournalReader::singleFile, + DocIdRewriter.identity(), + tmpDir); + constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + } + + private void createPrioReverseIndex() throws IOException { + + Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path outputFilePositions = 
ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.POSITIONS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); + Path tmpDir = workDir.resolve("tmp"); + + if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + + var constructor = new ReverseIndexConstructor( + outputFileDocs, + outputFileWords, + outputFilePositions, + IndexJournalReader::singleFile, + DocIdRewriter.identity(), + tmpDir); + + constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + } + + private void createForwardIndex() throws IOException { + + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); + Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); + + ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, + IndexJournalReader.paging(workDir), + outputFileDocsId, + outputFileDocsData, + domainRankings + ); + + converter.convert(); + } + + MurmurHash3_128 hasher = new MurmurHash3_128(); + + long kw(String s) { + return hasher.hashKeyword(s); + } + + class MockData { + private final Map> allData = new HashMap<>(); + private final Map metaByDoc = new HashMap<>(); + + public MockData add(MockDataDocument document, + MockDocumentMeta meta, + MockDataKeyword... 
words) + { + long id = UrlIdCodec.encodeId(document.domainId, document.ordinal); + + allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words)); + metaByDoc.put(id, meta); + + return this; + } + + void load() throws IOException, SQLException, URISyntaxException { + allData.forEach((doc, words) -> { + + var meta = metaByDoc.get(doc); + + var header = new IndexJournalEntryHeader( + doc, + meta.features, + 100, + meta.documentMetadata.encode() + ); + + String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new); + long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray(); + var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new); + + indexJournalWriter.put(header, + new IndexJournalEntryData(keywords, metadata, positions)); + }); + + var linkdbWriter = new DocumentDbWriter( + IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME) + ); + for (Long key : allData.keySet()) { + linkdbWriter.add(new DocdbUrlDetail( + key, + new EdgeUrl("https://www.example.com"), + "test", + "test", + 0., + "HTML5", + 0, + null, + 0, + 5 + )); + } + linkdbWriter.close(); + + indexJournalWriter.close(); + constructIndex(); + documentDbReader.reconnect(); + statefulIndex.switchIndex(); + } + } + + record MockDataDocument(int domainId, int ordinal) { + public MockDataDocument(long encodedId) { + this(UrlIdCodec.getDomainId(encodedId), UrlIdCodec.getDocumentOrdinal(encodedId)); + } + + public long docId() { + return UrlIdCodec.encodeId(domainId, ordinal); + } + + } + record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {} + record MockDataKeyword(String keyword, long termMetadata, IntList positions) {} + + MockDataKeyword w(String keyword, WordFlags flags, int... 
positions) { + return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions)); + + } +} diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 1af355f6..e5040157 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -13,7 +13,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.ReverseIndexConstructor; import nu.marginalia.index.forward.ForwardIndexConverter; @@ -142,6 +141,53 @@ public class IndexQueryServiceIntegrationSmokeTest { Assertions.assertArrayEquals(ids, actual); } + @Test + public void testSimple() throws Exception { + var linkdbWriter = new DocumentDbWriter( + IndexLocations.getLinkdbLivePath(fileStorageService) + .resolve(DOCDB_FILE_NAME) + ); + for (int i = 1; i < 512; i++) { + loadData(linkdbWriter, i); + } + linkdbWriter.close(); + documentDbReader.reconnect(); + + indexJournalWriter.close(); + constructIndex(); + statefulIndex.switchIndex(); + + var rsp = queryService.justQuery( + SearchSpecification.builder() + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) + .queryStrategy(QueryStrategy.SENTENCE) + .year(SpecificationLimit.none()) + .quality(SpecificationLimit.none()) + .size(SpecificationLimit.none()) + .rank(SpecificationLimit.none()) + .rankingParams(ResultRankingParameters.sensibleDefaults()) + .domains(new ArrayList<>()) + .searchSetIdentifier("NONE") + .query( + SearchQuery.builder("2") + .include("2") + .build() + ).build() + ); + + 
int[] idxes = new int[] { 62, 222, 382, 60, 124, 220, 284, 380, 444, 122 }; + long[] ids = IntStream.of(idxes).mapToLong(Long::valueOf).toArray(); + long[] actual = rsp.results + .stream() + .mapToLong(i -> i.rawIndexResult.getDocumentId()) + .map(UrlIdCodec::getDocumentOrdinal) + .toArray(); + + System.out.println(Arrays.toString(actual)); + System.out.println(Arrays.toString(ids)); + Assertions.assertArrayEquals(ids, actual); + } + @Test public void testDomainQuery() throws Exception { @@ -297,7 +343,6 @@ public class IndexQueryServiceIntegrationSmokeTest { return UrlIdCodec.encodeId((32 - (id % 32)), id); } - MurmurHash3_128 hasher = new MurmurHash3_128(); @SneakyThrows public void loadData(DocumentDbWriter ldbw, int id) { int[] factors = IntStream @@ -305,22 +350,44 @@ public class IndexQueryServiceIntegrationSmokeTest { .filter(v -> (id % v) == 0) .toArray(); + System.out.println("id:" + id + " factors: " + Arrays.toString(factors)); + long fullId = fullId(id); - var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); - - long[] data = new long[factors.length * 2]; - for (int i = 0; i < factors.length; i++) { - data[2 * i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); - data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } + var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), "test", "test", 0., "HTML5", 0, null, 0, 10 )); - String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new); + String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); + long[] metadata = new long[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = new WordMetadata(i, 
EnumSet.of(WordFlags.Title)).encode(); + } + GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; + ByteBuffer wa = ByteBuffer.allocate(32); + for (int i = 0; i < factors.length; i++) { + positions[i] = GammaCodedSequence.generate(wa, factors); + } + + indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); + } + + @SneakyThrows + public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) { + int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); + long fullId = UrlIdCodec.encodeId(domain, id); + var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue()); + + ldbw.add(new DocdbUrlDetail( + fullId, new EdgeUrl("https://www.example.com/"+id), + "test", "test", 0., "HTML5", 0, null, 0, 10 + )); + + + String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new); long[] metadata = new long[factors.length]; for (int i = 0; i < factors.length; i++) { metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); @@ -334,30 +401,4 @@ public class IndexQueryServiceIntegrationSmokeTest { indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); } - @SneakyThrows - public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) { - int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - long fullId = UrlIdCodec.encodeId(domain, id); - var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue()); - - ldbw.add(new DocdbUrlDetail( - fullId, new EdgeUrl("https://www.example.com/"+id), - "test", "test", 0., "HTML5", 0, null, 0, 10 - )); - - - String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new); - long[] metadata = new long[factors.length]; - for (int i = 0; i < factors.length; i++) { - metadata[i] = new WordMetadata(i, 
EnumSet.of(WordFlags.Title)).encode(); - } - GammaCodedSequence[] positions = new GammaCodedSequence[factors.length]; - ByteBuffer wa = ByteBuffer.allocate(16); - for (int i = 0; i < factors.length; i++) { - positions[i] = GammaCodedSequence.generate(wa, i); - } - - indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions)); - } - } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 9e9c3873..0251a471 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -565,6 +565,7 @@ public class IndexQueryServiceIntegrationTest { var header = new IndexJournalEntryHeader( doc, meta.features, + 100, meta.documentMetadata.encode() ); diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java deleted file mode 100644 index 41906904..00000000 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.ranking.results; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.factors.*; -import 
nu.marginalia.term_frequency_dict.TermFrequencyDict; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; - -import java.util.*; - -import static org.mockito.Mockito.when; - -class ResultValuatorTest { - - TermFrequencyDict dict; - ResultValuator valuator; - - @BeforeEach - public void setUp() { - - dict = Mockito.mock(TermFrequencyDict.class); - when(dict.docCount()).thenReturn(100_000); - - valuator = new ResultValuator( - new TermCoherenceFactor() - ); - - } - - CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); - - CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); - - CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; - - CompiledQueryLong highCountSubjectSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; - - - @Test - void evaluateTerms() { - - when(dict.getTermFreq("bob")).thenReturn(10); - ResultRankingContext context = new ResultRankingContext(100000, - ResultRankingParameters.sensibleDefaults(), - new BitSet(), - new BitSet(), - frequencyData, - frequencyData); - - long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)); - int features = 0; - - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); - double highCountNoTitle = 
valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null); - - System.out.println(titleOnlyLowCount); - System.out.println(titleLongOnlyLowCount); - System.out.println(highCountNoTitle); - System.out.println(highCountSubject); - } - - private long docMetadata(int topology, - int year, - int quality, - EnumSet flags) { - return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode(); - } - - private long wordMetadata(Set positions, Set wordFlags) { - long posBits = positions.stream() - .mapToLong(i -> ((1L << i) & 0xFF_FFFF_FFFF_FFFFL)) - .reduce((a,b) -> a|b) - .orElse(0L); - - return new WordMetadata(posBits, wordFlags).encode(); - } - -} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java deleted file mode 100644 index 5d2b47c9..00000000 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.bbpc.BrailleBlockPunchCards; -import nu.marginalia.model.idx.WordMetadata; -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; - -class TermCoherenceFactorTest { - - TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor(); - @Test - public void testAllBitsSet() { - var allPositionsSet = createSet( - ~0L, - ~0L - ); - - long mask = 
CompiledQueryAggregates.longBitmaskAggregate( - allPositionsSet, - SearchResultKeywordScore::positions - ); - - assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - - assertEquals(1.0, - termCoherenceFactor.calculateOverlap( - allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) - ) - ); - - } - - @Test - public void testNoBitsSet() { - var allPositionsSet = createSet( - 0, 0 - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); - - assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - - assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); - } - - @Test @SuppressWarnings("unchecked") - public void testLowPosMatches() { - var positions = createSet( - List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); - printMask(mask); - - } - - @Test @SuppressWarnings("unchecked") - public void testHiPosMatches() { - var positions = createSet( - List.of(55, 54, 53, 52), List.of(55, 54, 53, 52) - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); - printMask(mask); - } - - @Test - public void testBitMatchScaling() { - for (int i = 1; i < 48; i++) { - System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1L << i) - 1)); - } - } - - void printMask(long mask) { - System.out.println(BrailleBlockPunchCards.printBits(mask, 48)); - } - - CompiledQuery createSet(List... maskPositions) { - long[] positions = new long[maskPositions.length]; - - for (int i = 0; i < maskPositions.length; i++) { - for (long pos : maskPositions[i]) { - positions[i] |= (1L< createSet(long... 
positionMasks) { - List keywords = new ArrayList<>(); - - for (int i = 0; i < positionMasks.length; i++) { - keywords.add(new SearchResultKeywordScore("", 0, - new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode())); - } - - return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); - } -} \ No newline at end of file diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java index 335d57d8..87b2abd5 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/EliasGammaCodec.java @@ -17,12 +17,13 @@ public class EliasGammaCodec implements IntIterator { private final BitReader reader; int rem = 0; - private int last = 0; + private int last; private int next = 0; - private EliasGammaCodec(ByteBuffer buffer) { + private EliasGammaCodec(ByteBuffer buffer, int zero) { reader = new BitReader(buffer); + last = zero; int bits = reader.takeWhileZero(); if (!reader.hasMore()) { @@ -33,9 +34,24 @@ public class EliasGammaCodec implements IntIterator { } } + public static int readCount(ByteBuffer buffer) { + var reader = new BitReader(buffer); + + if (reader.getCurrentValue() > 0) { + int bits = reader.takeWhileZero(); + return reader.get(bits); + } + else { + return 0; + } + } + /** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */ public static IntIterator decode(ByteBuffer buffer) { - return new EliasGammaCodec(buffer); + return new EliasGammaCodec(buffer, 0); + } + public static IntIterator decodeWithOffset(ByteBuffer buffer, int offset) { + return new EliasGammaCodec(buffer, offset); } /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code. 
diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java index 58ff30d2..a2335fbf 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/GammaCodedSequence.java @@ -16,6 +16,7 @@ import java.util.StringJoiner; * */ public class GammaCodedSequence implements BinarySerializable, Iterable { private final ByteBuffer raw; + int startPos = 0; int startLimit = 0; @@ -43,6 +44,12 @@ public class GammaCodedSequence implements BinarySerializable, Iterable startLimit = bytes.limit(); } + public GammaCodedSequence(ByteBuffer bytes, int startPos, int startLimit) { + this.raw = bytes; + this.startPos = startPos; + this.startLimit = startLimit; + } + public GammaCodedSequence(byte[] bytes) { raw = ByteBuffer.allocate(bytes.length); raw.put(bytes); @@ -72,6 +79,18 @@ public class GammaCodedSequence implements BinarySerializable, Iterable return EliasGammaCodec.decode(raw); } + /** Return an iterator over the sequence with a constant offset applied to each value. + * This is useful for comparing sequences with different offsets, and adds zero + * extra cost to the decoding process which is already based on adding + * relative differences. 
+ * */ + public IntIterator offsetIterator(int offset) { + raw.position(startPos); + raw.limit(startLimit); + + return EliasGammaCodec.decodeWithOffset(raw, offset); + } + public IntList values() { var intItr = iterator(); IntArrayList ret = new IntArrayList(8); @@ -81,18 +100,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable return ret; } - /** Decode the sequence into an IntList; - * this is a somewhat slow operation, - * iterating over the data directly more performant */ - public IntList decode() { - IntArrayList ret = new IntArrayList(8); - var iter = iterator(); - while (iter.hasNext()) { - ret.add(iter.nextInt()); - } - return ret; - } - public int hashCode() { return raw.hashCode(); } @@ -116,7 +123,11 @@ public class GammaCodedSequence implements BinarySerializable, Iterable return raw; } - public int size() { + public int bufferSize() { return raw.capacity(); } + + public int valueCount() { + return EliasGammaCodec.readCount(buffer()); + } } diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java new file mode 100644 index 00000000..7a026862 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java @@ -0,0 +1,86 @@ +package nu.marginalia.sequence; + +import it.unimi.dsi.fastutil.ints.IntIterator; + +public class SequenceOperations { + + /** Return true if the sequences intersect, false otherwise. + * */ + public static boolean intersectSequences(IntIterator... 
sequences) { + + if (sequences.length <= 1) + return true; + + // Initialize values and find the maximum value + int[] values = new int[sequences.length]; + + for (int i = 0; i < sequences.length; i++) { + if (sequences[i].hasNext()) + values[i] = sequences[i].nextInt(); + else + return false; + } + + // Intersect the sequences by advancing all values smaller than the maximum seen so far + // until they are equal to the maximum value, or until the end of the sequence is reached + int max = Integer.MIN_VALUE; + int successes = 0; + for (int i = 0; successes < sequences.length; i = (i + 1) % sequences.length) + { + if (values[i] == max) { + successes++; + } else { + successes = 0; + + // Discard values until we reach the maximum value seen so far, + // or until the end of the sequence is reached + while (values[i] < max) { + if (sequences[i].hasNext()) + values[i] = sequences[i].nextInt(); + else + return false; + } + + // Update the maximum value, if necessary + max = Math.max(max, values[i]); + } + } + + return true; + } + + /** Return the minimum word distance between two sequences, or a negative value if either sequence is empty. 
+ * */ + public static int minDistance(IntIterator seqA, IntIterator seqB) + { + int minDistance = Integer.MAX_VALUE; + + if (!seqA.hasNext() || !seqB.hasNext()) + return -1; + + int a = seqA.nextInt(); + int b = seqB.nextInt(); + + while (true) { + int distance = Math.abs(a - b); + if (distance < minDistance) + minDistance = distance; + + if (a <= b) { + if (seqA.hasNext()) { + a = seqA.nextInt(); + } else { + break; + } + } else { + if (seqB.hasNext()) { + b = seqB.nextInt(); + } else { + break; + } + } + } + + return minDistance; + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java index 08979f0d..61125d2e 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java @@ -20,6 +20,10 @@ public class BitReader { this.currentValue = 0; } + public long getCurrentValue() { + return currentValue; + } + /** Read the next bit from the buffer */ public boolean getBit() { if (bitPosition <= 0) { diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java new file mode 100644 index 00000000..dbae6f29 --- /dev/null +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java @@ -0,0 +1,75 @@ +package nu.marginalia.sequence; + +import it.unimi.dsi.fastutil.ints.IntIterator; +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; + +import static org.junit.jupiter.api.Assertions.*; + +class SequenceOperationsTest { + + @Test + void intersectSequencesSingle() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1); + + assertTrue(SequenceOperations.intersectSequences(seq1.iterator())); + } + + @Test + void 
intersectSequencesTrivialMatch() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 1); + + assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator())); + } + + @Test + void intersectSequencesTrivialMismatch() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2); + + assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator())); + } + + @Test + void intersectSequencesOffsetMatch() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 3); + + assertTrue(SequenceOperations.intersectSequences(seq1.offsetIterator(0), seq2.offsetIterator(-2))); + } + + @Test + void intersectSequencesDeepMatch() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14); + + assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator())); + } + + @Test + void intersectSequencesDeepMatch3() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14); + GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9); + + assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator())); + } + + @Test + void intersectSequencesDeepMismatch() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 14); + + 
assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator())); + } + +} \ No newline at end of file diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java index c981f0da..5e98f96c 100644 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java +++ b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java @@ -26,6 +26,8 @@ public class DocumentRecordKeywordsProjection { public int htmlFeatures; public long documentMetadata; + public int length; + public List words; public TLongList metas; public List positions; @@ -39,13 +41,14 @@ public class DocumentRecordKeywordsProjection { } public static Collection requiredColumns() { - return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata"); + return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata", "length"); } @SneakyThrows public DocumentRecordKeywordsProjection add(String heading, Object value) { switch (heading) { case "domain" -> domain = (String) value; + case "length" -> length = (Integer) value; case "ordinal" -> ordinal = (Integer) value; case "htmlFeatures" -> htmlFeatures = (Integer) value; case "documentMetadata" -> documentMetadata = (Long) value; diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java index 9c87bab7..f523f8e7 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java @@ -6,12 +6,10 @@ import lombok.SneakyThrows; import nu.marginalia.IndexLocations; import 
nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.index.journal.IndexJournalFileNames; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,18 +39,11 @@ public class LoaderIndexJournalWriter { indexWriter = new IndexJournalWriterPagingImpl(indexArea); } - public void putWords(long combinedId, - int features, - DocumentMetadata metadata, - DocumentKeywords wordSet) { - - putWords(combinedId, features, metadata.encode(), wordSet); - } - @SneakyThrows public void putWords(long combinedId, int features, long metadata, + int length, DocumentKeywords wordSet) { if (wordSet.isEmpty()) { @@ -65,7 +56,7 @@ public class LoaderIndexJournalWriter { return; } - var header = new IndexJournalEntryHeader(combinedId, features, metadata); + var header = new IndexJournalEntryHeader(combinedId, features, length, metadata); var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions); indexWriter.put(header, data); diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java index f69a891d..ab43bdd7 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -75,6 +75,7 @@ public class KeywordLoaderService { writer.putWords(combinedId, projection.htmlFeatures, projection.documentMetadata, + projection.length, words); } } \ No newline at end 
of file diff --git a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java index 2a2cc003..be3fe0b7 100644 --- a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java +++ b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java @@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule { long positions) { results.add(new DecoratedSearchResultItem( - new SearchResultItem(url.hashCode(), 2, 3, false), + new SearchResultItem(url.hashCode(), 2, 3), new EdgeUrl(url), title, description,