From 9894f374123dfdbb6789ff304790dc4c1ac05811 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 24 Aug 2023 16:44:27 +0200 Subject: [PATCH] (index) Implement new URL ID coding scheme. Also refactor along the way. Really needs an additional pass, these tests are very hairy. --- .../model/results/SearchResultItem.java | 33 ++++++--- .../nu/marginalia/model/id/UrlIdCodec.java | 22 +----- .../marginalia/model/id/UrlIdCodecTest.java | 19 +++++ .../nu/marginalia/ranking/DomainRankings.java | 4 + .../index/forward/ForwardIndexConverter.java | 16 ++-- .../index/forward/ForwardIndexParameters.java | 7 +- .../index/forward/ForwardIndexReader.java | 14 ++-- .../forward/ParamMatchingQueryFilter.java | 10 ++- .../forward/ForwardIndexConverterTest.java | 5 +- .../journal/reader/IndexJournalReadEntry.java | 7 +- .../journal/reader/IndexJournalReader.java | 4 +- ...ndexJournalReaderSingleCompressedFile.java | 18 +---- .../index/journal/IndexJournalTest.java | 52 +++++-------- .../marginalia/index/searchset/SearchSet.java | 4 +- .../index/full/ReverseIndexFullConverter.java | 19 +---- .../index/full/ReverseIndexFullReader.java | 15 +++- .../ReverseIndexPriorityConverter.java | 19 +---- .../ReverseIndexFullConverterTest.java | 12 ++- .../ReverseIndexFullConverterTest2.java | 5 +- .../ReverseIndexPriorityConverterTest2.java | 5 +- .../marginalia/index/index/SearchIndex.java | 10 +-- .../index/index/SearchIndexReader.java | 4 - .../index/results/IndexMetadataService.java | 74 +++++++++---------- .../index/results/IndexResultValuator.java | 34 +++++---- .../index/svc/IndexQueryService.java | 4 +- .../index/svc/searchset/RankingSearchSet.java | 5 +- .../index/svc/searchset/SearchSetAny.java | 2 +- .../svc/IndexQueryServiceIntegrationTest.java | 31 +++++--- 28 files changed, 227 insertions(+), 227 deletions(-) diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java index 9890b3aa..3fbbb128 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultItem.java @@ -4,13 +4,15 @@ import lombok.AllArgsConstructor; import lombok.Getter; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.UrlIdCodec; +import org.jetbrains.annotations.NotNull; import java.util.ArrayList; import java.util.List; /** Represents a document matching a search query */ @AllArgsConstructor @Getter -public class SearchResultItem { +public class SearchResultItem implements Comparable { /** Encoded ID that contains both the URL id and its ranking */ public final long combinedId; @@ -25,15 +27,22 @@ public class SearchResultItem { this.keywordScores = new ArrayList<>(16); } + @Deprecated public EdgeId getUrlId() { return new EdgeId<>(getUrlIdInt()); } + public long getDocumentId() { + return UrlIdCodec.removeRank(combinedId); + } + + @Deprecated public int getUrlIdInt() { return (int)(combinedId & 0xFFFF_FFFFL); } + public int getRanking() { - return (int)(combinedId >>> 32); + return UrlIdCodec.getRank(combinedId); } /* Used for evaluation */ @@ -45,16 +54,12 @@ public class SearchResultItem { return scoreValue; } - private transient int domainId = Integer.MIN_VALUE; - public void setDomainId(int domainId) { - this.domainId = domainId; - } public int getDomainId() { - return this.domainId; + return UrlIdCodec.getDomainId(this.combinedId); } public int hashCode() { - return getUrlIdInt(); + return Long.hashCode(combinedId); } public String toString() { @@ -67,7 +72,7 @@ public class SearchResultItem { if (other == this) return true; if (other instanceof SearchResultItem o) { - return o.getUrlIdInt() == getUrlIdInt(); + return o.getDocumentId() == getDocumentId(); } return false; } @@ -81,4 +86,14 @@ public class SearchResultItem { return domainId; } + + @Override + public int compareTo(@NotNull SearchResultItem o) { + // this looks like a bug, but we actually want this in a reversed order + int diff = o.getScore().compareTo(getScore()); + if (diff != 0) + return diff; + + return Long.compare(this.combinedId, o.combinedId); + } } diff --git a/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java index 86c8deac..26ac847e 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java +++ b/code/common/model/src/main/java/nu/marginalia/model/id/UrlIdCodec.java @@ -29,33 +29,17 @@ package nu.marginalia.model.id; * */ public class UrlIdCodec { - private static final long RANK_MASK = 0x8600_0000_0000_0000L; + private static final long RANK_MASK = 0xFE00_0000_0000_0000L; private static final int DOCORD_MASK = 0x03FF_FFFF; /** Encode a URL id without a ranking element */ public static long encodeId(int domainId, int documentOrdinal) { - domainId &= 0x7FFF_FFFFL; + domainId &= 0x7FFF_FFFF; documentOrdinal &= 0x03FF_FFFF; return ((long) domainId << 26) | documentOrdinal; } - /** Encode a URL id with the optional ranking part - * - * @param rank [0,1] the importance of the domain, low is good - * @param domainId - * @param documentOrdinal - * @return - */ - public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) { - long rankPart = (int)(rank * (1<<6)); - - if (rankPart >= 64) rankPart = 63; - if (rankPart < 0) rankPart = 0; - - return encodeId(domainId, documentOrdinal) | (rankPart << 57); - } - /** Add a ranking element to an existing combined URL id. * * @param rank [0,1] the importance of the domain, low is good @@ -88,7 +72,7 @@ public class UrlIdCodec { /** Mask out the ranking element from this URL id */ public static long removeRank(long combinedId) { - return combinedId & (~RANK_MASK); + return combinedId & ~RANK_MASK; } } diff --git a/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java index 10fda63b..727b983b 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/id/UrlIdCodecTest.java @@ -27,6 +27,25 @@ class UrlIdCodecTest { assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded)); } + @Test + public void testRankBoundsAdd() { + long encoded = UrlIdCodec.encodeId(0, 0); + encoded = UrlIdCodec.addRank(1.f, encoded); + assertEquals(0, UrlIdCodec.getDomainId(encoded)); + assertEquals(63, UrlIdCodec.getRank(encoded)); + assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded)); + } + + @Test + public void testRemoveRank() { + long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0); + encoded = UrlIdCodec.addRank(1.f, encoded); + encoded = UrlIdCodec.removeRank(encoded); + assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded)); + assertEquals(0, UrlIdCodec.getRank(encoded)); + assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded)); + } + @Test public void testRankBoundsNeg() { long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0); diff --git a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java index b408f980..f9c1dbcc 100644 --- a/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java +++ b/code/features-index/domain-ranking/src/main/java/nu/marginalia/ranking/DomainRankings.java @@ -37,6 +37,10 @@ public class DomainRankings { return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE); } + public float getSortRanking(int domainId) { + return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE; + } + public int size() { return rankings.size(); } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java index 4aa083e3..a5a750fe 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexConverter.java @@ -10,6 +10,8 @@ import nu.marginalia.ranking.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; +import org.roaringbitmap.longlong.LongConsumer; +import org.roaringbitmap.longlong.Roaring64Bitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -83,12 +85,11 @@ public class ForwardIndexConverter { LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); journalReader.forEach(entry -> { - long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId()); + long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId()); int ranking = domainRankings.getRanking(entry.domainId()); long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking); - docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId()); docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures()); }); @@ -109,17 +110,18 @@ public class ForwardIndexConverter { } private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { - RoaringBitmap rbm = new RoaringBitmap(); - journalReader.forEachUrlId(rbm::add); + Roaring64Bitmap rbm = new Roaring64Bitmap(); + journalReader.forEachDocId(rbm::add); - LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality()); - rbm.forEach(new IntConsumer() { + LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality()); + rbm.forEach(new LongConsumer() { int offset; @Override - public void accept(int value) { + public void accept(long value) { ret.set(offset++, value); } }); + return ret; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java index f9c17a71..0b306050 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,9 +1,8 @@ package nu.marginalia.index.forward; class ForwardIndexParameters { - public static final int ENTRY_SIZE = 3; - public static final int DOMAIN_OFFSET = 0; - public static final int METADATA_OFFSET = 1; - public static final int FEATURES_OFFSET = 2; + public static final int ENTRY_SIZE = 2; + public static final int METADATA_OFFSET = 0; + public static final int FEATURES_OFFSET = 1; } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java index 3bdf14c8..dc888aa9 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -3,6 +3,7 @@ package nu.marginalia.index.forward; import com.upserve.uppend.blobs.NativeIO; import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.array.LongArray; +import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -71,6 +72,8 @@ public class ForwardIndexReader { } public long getDocMeta(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + long offset = idxForDoc(docId); if (offset < 0) return 0; @@ -78,20 +81,17 @@ public class ForwardIndexReader { } public int getHtmlFeatures(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + long offset = idxForDoc(docId); if (offset < 0) return 0; return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET); } - public int getDomainId(long docId) { - long offset = idxForDoc(docId); - if (offset < 0) return 0; - - return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET)); - } - private int idxForDoc(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + return idToOffset.get(docId); } diff --git a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java index 8d22516b..d7e6a9b3 100644 --- a/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java +++ b/code/features-index/index-forward/src/main/java/nu/marginalia/index/forward/ParamMatchingQueryFilter.java @@ -1,5 +1,6 @@ package nu.marginalia.index.forward; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.query.IndexQueryParams; @@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf { } @Override - public boolean test(long docId) { - int urlId = (int) (docId & 0xFFFF_FFFFL); - int domainId = forwardIndexReader.getDomainId(urlId); - long meta = forwardIndexReader.getDocMeta(urlId); + public boolean test(long combinedId) { + long docId = UrlIdCodec.removeRank(combinedId); + int domainId = UrlIdCodec.getDomainId(docId); + + long meta = forwardIndexReader.getDocMeta(docId); if (!validateDomain(domainId, meta)) { return false; diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index a801021d..efa2ee92 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -113,8 +113,9 @@ class ForwardIndexConverterTest { var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); for (int i = 36; i < workSetSize; i++) { - assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i)); - assertEquals(i/20, forwardReader.getDomainId(i)); + long docId = createId(i, i/20); + assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId)); + assertEquals(i/20, UrlIdCodec.getDomainId(docId)); } } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java index 00ba3b88..fa220fec 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReadEntry.java @@ -2,6 +2,7 @@ package nu.marginalia.index.journal.reader; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryHeader; +import nu.marginalia.model.id.UrlIdCodec; import java.io.DataInputStream; import java.io.IOException; @@ -51,11 +52,7 @@ public class IndexJournalReadEntry { } public int domainId() { - return (int) (docId() >>> 32L); - } - - public int urlId() { - return (int) (docId() & 0xFFFF_FFFFL); + return UrlIdCodec.getDomainId(docId()); } public IndexJournalEntryData readEntry() { diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java index 1467c500..aaeb628f 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReader.java @@ -8,6 +8,7 @@ import org.jetbrains.annotations.NotNull; import java.io.IOException; import java.util.Iterator; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; public interface IndexJournalReader extends Iterable { int FILE_HEADER_SIZE_LONGS = 2; @@ -19,13 +20,12 @@ public interface IndexJournalReader extends Iterable { void forEachWordId(IntConsumer consumer); - void forEachUrlIdWordId(BiIntConsumer consumer); void forEachDocIdWordId(LongIntConsumer consumer); void forEachDocIdRecord(LongObjectConsumer consumer); - void forEachUrlId(IntConsumer consumer); + void forEachDocId(LongConsumer consumer); @NotNull @Override diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java index c64bccf5..c7a4d6c7 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/reader/IndexJournalReaderSingleCompressedFile.java @@ -14,6 +14,7 @@ import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.Iterator; import java.util.function.IntConsumer; +import java.util.function.LongConsumer; import java.util.function.Predicate; public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader { @@ -115,19 +116,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } } - @Override - public void forEachUrlIdWordId(BiIntConsumer consumer) { - for (var entry : this) { - var data = entry.readEntry(); - - for (var post : data) { - if (filter(entry, post)) { - consumer.accept(entry.urlId(), post.wordId()); - } - } - } - } - @Override public void forEachDocIdWordId(LongIntConsumer consumer) { for (var entry : this) { @@ -154,10 +142,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade } } @Override - public void forEachUrlId(IntConsumer consumer) { + public void forEachDocId(LongConsumer consumer) { for (var entry : this) { if (filter(entry)) { - consumer.accept(entry.urlId()); + consumer.accept(entry.docId()); } } } diff --git a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java index 9cb96781..dc5c006a 100644 --- a/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java +++ b/code/features-index/index-journal/src/test/java/nu/marginalia/index/journal/IndexJournalTest.java @@ -6,6 +6,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReader; import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.model.id.UrlIdCodec; import org.apache.commons.lang3.tuple.Pair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -25,6 +26,9 @@ public class IndexJournalTest { KeywordLexicon lexicon; IndexJournalReader reader; + long firstDocId = UrlIdCodec.encodeId(44, 10); + long secondDocId = UrlIdCodec.encodeId(43, 15); + @BeforeEach public void setUp() throws IOException { tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat"); @@ -65,11 +69,11 @@ public class IndexJournalTest { } @Test - public void forEachUrlId() { - List expected = List.of(10, 15); - List actual = new ArrayList<>(); + public void forEachDocId() { + List expected = List.of(firstDocId, secondDocId); + List actual = new ArrayList<>(); - reader.forEachUrlId(actual::add); + reader.forEachDocId(actual::add); assertEquals(expected, actual); } @@ -82,31 +86,15 @@ public class IndexJournalTest { assertEquals(expected, actual); } - - @Test - public void forEachUrlIdWordId() { - List> expected = List.of( - Pair.of(10, 1), - Pair.of(10, 2), - Pair.of(10, 3), - Pair.of(10, 5), - Pair.of(15, 5), - Pair.of(15, 6)); - List> actual = new ArrayList<>(); - - reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word))); - assertEquals(expected, actual); - } - @Test public void forEachDocIdWordId() { List> expected = List.of( - Pair.of(10L | (44L << 32), 1), - Pair.of(10L | (44L << 32), 2), - Pair.of(10L | (44L << 32), 3), - Pair.of(10L | (44L << 32), 5), - Pair.of(15L | (43L << 32), 5), - Pair.of(15L | (43L << 32), 6)); + Pair.of(firstDocId, 1), + Pair.of(firstDocId, 2), + Pair.of(firstDocId, 3), + Pair.of(firstDocId, 5), + Pair.of(secondDocId, 5), + Pair.of(secondDocId, 6)); List> actual = new ArrayList<>(); reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word))); @@ -116,12 +104,12 @@ public class IndexJournalTest { @Test public void forEachDocIdRecord() { List> expected = List.of( - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)), - Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)), - Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)), - Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6)) + Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)), + Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)), + Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)), + Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6)) ); List> actual = new ArrayList<>(); diff --git a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java index 529950e7..8fba1801 100644 --- a/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java +++ b/code/features-index/index-query/src/main/java/nu/marginalia/index/searchset/SearchSet.java @@ -3,10 +3,10 @@ package nu.marginalia.index.searchset; public interface SearchSet { /** - * Returns true if the given urlId is contained in the set + * Returns true if the given domainId is contained in the set * or if the documentMetadata vibes with the set * */ - boolean contains(int urlId, long documentMetadata); + boolean contains(int domainId, long documentMetadata); } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java index f2e3f91b..cc44c35b 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullConverter.java @@ -7,6 +7,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalStatistics; import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.rwf.RandomWriteFunnel; import nu.marginalia.array.IntArray; @@ -179,21 +180,9 @@ public class ReverseIndexFullConverter { @SneakyThrows @Override public void accept(long docId, IndexJournalEntryData.Record record) { - - /* Encode the ID as - * - * 32 bits 32 bits - * [ ranking | url-id ] - * - * in order to get low-ranking documents to be considered first - * when sorting the items. - */ - - int domainId = (int) (docId >>> 32); - long rankingId = (long) domainRankings.getRanking(domainId) << 32; - - int urlId = (int) (docId & 0xFFFF_FFFFL); - long rankEncodedId = rankingId | urlId; + int domainId = UrlIdCodec.getDomainId(docId); + float rankingPart = domainRankings.getSortRanking(domainId); + long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId); final int wordId = record.wordId(); long offset = startOfRange(wordId); diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java index 6342c436..714ec483 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/full/ReverseIndexFullReader.java @@ -111,10 +111,23 @@ public class ReverseIndexFullReader { return new long[docIds.length]; } - Arrays.sort(docIds); + assert isSorted(docIds) : "The input array docIds is assumed to be sorted"; var reader = createReaderNew(offset); return reader.queryData(docIds, 1); } + private boolean isSorted(long[] ids) { + if (ids.length == 0) + return true; + long prev = ids[0]; + + for (int i = 1; i < ids.length; i++) { + if(ids[i] <= prev) + return false; + } + + return true; + } + } diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java index 4c9cd0d0..90368c5a 100644 --- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java +++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/priority/ReverseIndexPriorityConverter.java @@ -10,6 +10,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator; import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalStatistics; import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.rwf.RandomWriteFunnel; import nu.marginalia.service.control.ServiceHeartbeat; @@ -178,21 +179,9 @@ public class ReverseIndexPriorityConverter { @SneakyThrows @Override public void accept(long docId, IndexJournalEntryData.Record record) { - - /* Encode the ID as - * - * 32 bits 32 bits - * [ ranking | url-id ] - * - * in order to get low-ranking documents to be considered first - * when sorting the items. - */ - - int domainId = (int) (docId >>> 32); - long rankingId = (long) domainRankings.getRanking(domainId) << 32; - - int urlId = (int) (docId & 0xFFFF_FFFFL); - long rankEncodedId = rankingId | urlId; + int domainId = UrlIdCodec.getDomainId(docId); + float rankingPart = domainRankings.getSortRanking(domainId); + long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId); final int wordId = record.wordId(); long offset = startOfRange(wordId); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java index 7644d019..49365e1a 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest.java @@ -9,6 +9,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -113,17 +114,17 @@ class ReverseIndexFullConverterTest { var buffer = new LongQueryBuffer(32); reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(this::addMaxRank).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(this::addMaxRank).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer); - assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData()); + assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(this::addMaxRank).toArray(), buffer.copyData()); System.out.println(buffer); buffer.reset(); @@ -137,4 +138,9 @@ class ReverseIndexFullConverterTest { TestUtil.clearTempDir(dataDir); } + + // Add a max domain rank component to the input, when interpreted as an ID + private long addMaxRank(long in) { + return UrlIdCodec.addRank(1f, in); + } } \ No newline at end of file diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java index 4da283a0..e3a9848c 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexFullConverterTest2.java @@ -11,6 +11,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -101,8 +102,8 @@ class ReverseIndexFullConverterTest2 { return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); } - long createId(long url, long domain) { - return (domain << 32) | url; + long createId(int url, int domain) { + return UrlIdCodec.encodeId(domain, url); } public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); diff --git a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java index a5ad6940..12e72f99 100644 --- a/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java +++ b/code/features-index/index-reverse/src/test/java/nu/marginalia/index/reverse/ReverseIndexPriorityConverterTest2.java @@ -13,6 +13,7 @@ import nu.marginalia.index.priority.ReverseIndexPriorityParameters; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.control.ServiceTaskHeartbeat; @@ -101,8 +102,8 @@ class ReverseIndexPriorityConverterTest2 { return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); } - long createId(long url, long domain) { - return (domain << 32) | url; + long createId(int url, int domain) { + return UrlIdCodec.encodeId(domain, url); } public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index d4664531..0da02a42 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -188,7 +188,11 @@ public class SearchIndex { indexReader.numHitsPrio(b) ); } - /** Replaces the values of ids with their associated metadata, or 0L if absent */ + + /** Return an array of encoded document metadata longs corresponding to the + * document identifiers provided; with metadata for termId. The input array + * docs[] *must* be sorted. + */ public long[] getTermMetadata(int termId, long[] docs) { return indexReader.getMetadata(termId, docs); } @@ -200,10 +204,6 @@ public class SearchIndex { return indexReader.getHtmlFeatures(docId); } - public int getDomainId(long docId) { - return indexReader.getDomainId(docId); - } - public int getTotalDocCount() { return indexReader.totalDocCount(); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java index 8a3e3e6e..55a899b7 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexReader.java @@ -60,10 +60,6 @@ public class SearchIndexReader { return forwardIndexReader.getDocMeta(docId); } - public int getDomainId(long docId) { - return forwardIndexReader.getDomainId(docId); - } - public int totalDocCount() { return forwardIndexReader.totalDocCount(); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index 996afafa..7bbb5351 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -3,6 +3,7 @@ package nu.marginalia.index.results; import com.google.inject.Inject; import gnu.trove.map.hash.TObjectIntHashMap; import gnu.trove.set.hash.TLongHashSet; +import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.index.client.model.query.SearchSubquery; @@ -18,7 +19,6 @@ import java.util.OptionalInt; public class IndexMetadataService { private final SearchIndex index; private final SearchTermsService searchTermsService; - private final ResultValuator searchResultValuator; @Inject @@ -30,34 +30,16 @@ public class IndexMetadataService { this.searchResultValuator = searchResultValuator; } - public long getDocumentMetadata(long urlId) { - return index.getDocumentMetadata(urlId); + public long getDocumentMetadata(long docId) { + return index.getDocumentMetadata(docId); } public int getHtmlFeatures(long urlId) { return index.getHtmlFeatures(urlId); } - public int getDomainId(long urlId) { - return index.getDomainId(urlId); - } - - public long[] getTermMetadata(int termId, long[] docIdsAll) { - return index.getTermMetadata(termId, docIdsAll); - } - - public TermMetadata getTermMetadata(long[] docIdsAll, int[] termIdsList) { - var termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsList.length, 0.5f); - - for (int term : termIdsList) { - var metadata = getTermMetadata(term, docIdsAll); - - for (int i = 0; i < docIdsAll.length; i++) { - termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]); - } - } - - return new TermMetadata(termdocToMeta); + public TermMetadataForDocuments getTermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) { + return new TermMetadataForDocuments(docIdsAll, termIdsList); } public QuerySearchTerms getSearchTerms(List searchTermVariants) { @@ -80,7 +62,6 @@ public class IndexMetadataService { } } - return new QuerySearchTerms(termToId, termIdsList.toIntArray(), getTermCoherences(searchTermVariants)); @@ -92,7 +73,10 @@ public class IndexMetadataService { for (var subquery : searchTermVariants) { for (var coh : subquery.searchTermCoherences) { - int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray(); + int[] ids = coh.stream().map(searchTermsService::lookUpWord) + .filter(OptionalInt::isPresent) + .mapToInt(OptionalInt::getAsInt) + .toArray(); coherences.add(ids); } @@ -116,30 +100,43 @@ public class IndexMetadataService { var ret = new TLongHashSet(resultsArray.length); for (int priorityTerm : priorityTermIds) { - long[] metadata = getTermMetadata(priorityTerm, resultsArray); + long[] metadata = index.getTermMetadata(priorityTerm, resultsArray); for (int i = 0; i < metadata.length; i++) { if (metadata[i] != 0) ret.add(resultsArray[i]); } } return ret; - - } public ResultValuator getSearchResultValuator() { return searchResultValuator; } - public static class TermMetadata { - private final Long2LongOpenHashMap termdocToMeta; + public class TermMetadataForDocuments { + private final Int2ObjectArrayMap termdocToMeta; - public TermMetadata(Long2LongOpenHashMap termdocToMeta) { - this.termdocToMeta = termdocToMeta; + public TermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) { + termdocToMeta = new Int2ObjectArrayMap<>(termIdsList.length); + + for (int termId : termIdsList) { + var mapForTerm = new Long2LongOpenHashMap(docIdsAll.length); + + var metadata = index.getTermMetadata(termId, docIdsAll); + for (int i = 0; i < docIdsAll.length; i++) { + mapForTerm.put(docIdsAll[i], metadata[i]); + } + + termdocToMeta.put(termId, mapForTerm); + } } public long getTermMetadata(int termId, long docId) { - return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); + var docsForTerm = termdocToMeta.get(termId); + if (docsForTerm == null) { + return 0; + } + return docsForTerm.getOrDefault(docId, 0); } public boolean testCoherence(long docId, TermCoherences coherences) { @@ -164,20 +161,19 @@ public class IndexMetadataService { public final TermCoherences coherences; - public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll, TermCoherences coherences) { + public QuerySearchTerms(TObjectIntHashMap termToId, + int[] termIdsAll, + TermCoherences coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; this.coherences = coherences; } - public int get(String searchTerm) { + public int getIdForTerm(String searchTerm) { return termToId.get(searchTerm); } } + /** wordIds that we require to be in the same sentence */ public record TermCoherences(List words) {} - - private static long termdocKey(int termId, long docId) { - return (docId << 32) | Integer.toUnsignedLong(termId); - } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 34ea1826..c8255f49 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.query.IndexQueryParams; import nu.marginalia.ranking.ResultValuator; +import java.util.Arrays; import java.util.List; public class IndexResultValuator { @@ -21,7 +22,7 @@ public class IndexResultValuator { private final IndexQueryParams queryParams; private final TLongHashSet resultsWithPriorityTerms; - private final IndexMetadataService.TermMetadata termMetadata; + private final IndexMetadataService.TermMetadataForDocuments termMetadataForDocuments; private final IndexMetadataService.QuerySearchTerms searchTerms; private final ResultRankingContext rankingContext; @@ -36,16 +37,17 @@ public class IndexResultValuator { this.rankingContext = rankingContext; this.searchResultValuator = metadataService.getSearchResultValuator(); - final long[] resultsArray = results.toArray(); + final long[] ids = results.toArray(); + Arrays.sort(ids); this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.queryParams = queryParams; this.metadataService = metadataService; this.searchTerms = metadataService.getSearchTerms(subqueries); - this.termMetadata = metadataService.getTermMetadata(results.toArray(), searchTerms.termIdsAll); + this.termMetadataForDocuments = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); - resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, resultsArray); + resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, ids); } private final long flagsFilterMask = @@ -54,12 +56,10 @@ public class IndexResultValuator { public SearchResultItem calculatePreliminaryScore(long id) { SearchResultItem searchResult = new SearchResultItem(id); - final long urlIdInt = searchResult.getUrlIdInt(); + final long docId = searchResult.getDocumentId(); - searchResult.setDomainId(metadataService.getDomainId(urlIdInt)); - - long docMetadata = metadataService.getDocumentMetadata(urlIdInt); - int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt); + long docMetadata = metadataService.getDocumentMetadata(docId); + int htmlFeatures = metadataService.getHtmlFeatures(docId); int maxFlagsCount = 0; boolean anyAllSynthetic = false; @@ -76,21 +76,21 @@ public class IndexResultValuator { for (int termIdx = 0; termIdx < termList.size(); termIdx++) { String searchTerm = termList.get(termIdx); - long metadata = termMetadata.getTermMetadata( - searchTerms.get(searchTerm), - searchResult.getUrlIdInt() + long termMetadata = termMetadataForDocuments.getTermMetadata( + searchTerms.getIdForTerm(searchTerm), + searchResult.combinedId ); var score = new SearchResultKeywordScore( querySetId, searchTerm, - metadata, + termMetadata, docMetadata, htmlFeatures, resultsWithPriorityTerms.contains(searchResult.combinedId) ); - synthetic &= WordFlags.Synthetic.isPresent(metadata); + synthetic &= WordFlags.Synthetic.isPresent(termMetadata); searchResult.keywordScores.add(score); @@ -117,11 +117,13 @@ public class IndexResultValuator { final boolean hasPriorityTerm = resultsWithPriorityTerms.contains(id); - double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext); + double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, + 5000, + rankingContext); boolean disqualified = false; - if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences)) + if (!termMetadataForDocuments.testCoherence(docId, searchTerms.coherences)) disqualified = true; else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) disqualified = true; diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index c100388e..0517689f 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -266,9 +266,7 @@ public class IndexQueryService { var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); - results.sort(Comparator.comparing(SearchResultItem::getScore).reversed() - .thenComparingInt(SearchResultItem::getRanking) - .thenComparingInt(SearchResultItem::getUrlIdInt)); + results.sort(Comparator.naturalOrder()); List resultsList = new ArrayList<>(results.size()); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java index 64507955..d8dd9ca1 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/RankingSearchSet.java @@ -63,12 +63,13 @@ public class RankingSearchSet implements SearchSet { } @Override - public boolean contains(int urlId, long documentMetadata) { + public boolean contains(int domainId, long documentMetadata) { // This is the main check - if (set.contains(urlId) || set.isEmpty()) { + if (set.contains(domainId) || set.isEmpty()) { return true; } + // TODO return false; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java index 2f457974..b0ee4e39 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/searchset/SearchSetAny.java @@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet; public class SearchSetAny implements SearchSet { @Override - public boolean contains(int urlId, long meta) { + public boolean contains(int domainId, long meta) { return true; } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index 01be347b..2aca711b 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -15,6 +15,7 @@ import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.lexicon.KeywordLexicon; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordMetadata; @@ -96,12 +97,14 @@ public class IndexQueryServiceIntegrationTest { List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()))).build()); - Assertions.assertArrayEquals( - new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, - rsp.results - .stream() - .mapToInt(SearchResultItem::getUrlIdInt) - .toArray()); + int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 }; + long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray(); + long[] actual = rsp.results + .stream() + .mapToLong(SearchResultItem::getDocumentId) + .toArray(); + + Assertions.assertArrayEquals(ids, actual); } @@ -127,9 +130,11 @@ public class IndexQueryServiceIntegrationTest { .subqueries(List.of(new SearchSubquery( List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()))).build()); - Assertions.assertArrayEquals( - new int[] { 210, 270 }, - rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray()); + int[] idxes = new int[] { 210, 270 }; + long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray(); + long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray(); + + Assertions.assertArrayEquals(ids, actual); } @Test @@ -169,13 +174,17 @@ public class IndexQueryServiceIntegrationTest { } + private long fullId(int id) { + return UrlIdCodec.encodeId((32 - (id % 32)), id); + } + public void loadData(int id) { int[] factors = IntStream .rangeClosed(1, id) .filter(v -> (id % v) == 0) .toArray(); - long fullId = id | ((long) (32 - (id % 32)) << 32); + long fullId = fullId(id); var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); @@ -190,7 +199,7 @@ public class IndexQueryServiceIntegrationTest { public void loadDataWithDomain(int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); - var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue()); + var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue()); long[] data = new long[factors.length*2]; for (int i = 0; i < factors.length; i++) {