(index) Implement new URL ID coding scheme.

Also refactor along the way.  This really needs an additional pass; these tests are very hairy.
This commit is contained in:
Viktor Lofgren 2023-08-24 16:44:27 +02:00
parent 6a04cdfddf
commit 9894f37412
28 changed files with 227 additions and 227 deletions

View File

@ -4,13 +4,15 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeId;
import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull;
import java.util.ArrayList;
import java.util.List;
/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class SearchResultItem {
public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking */
public final long combinedId;
@ -25,15 +27,22 @@ public class SearchResultItem {
this.keywordScores = new ArrayList<>(16);
}
@Deprecated
public EdgeId<EdgeUrl> getUrlId() {
return new EdgeId<>(getUrlIdInt());
}
public long getDocumentId() {
return UrlIdCodec.removeRank(combinedId);
}
@Deprecated
public int getUrlIdInt() {
return (int)(combinedId & 0xFFFF_FFFFL);
}
public int getRanking() {
return (int)(combinedId >>> 32);
return UrlIdCodec.getRank(combinedId);
}
/* Used for evaluation */
@ -45,16 +54,12 @@ public class SearchResultItem {
return scoreValue;
}
private transient int domainId = Integer.MIN_VALUE;
public void setDomainId(int domainId) {
this.domainId = domainId;
}
public int getDomainId() {
return this.domainId;
return UrlIdCodec.getDomainId(this.combinedId);
}
public int hashCode() {
return getUrlIdInt();
return Long.hashCode(combinedId);
}
public String toString() {
@ -67,7 +72,7 @@ public class SearchResultItem {
if (other == this)
return true;
if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt();
return o.getDocumentId() == getDocumentId();
}
return false;
}
@ -81,4 +86,14 @@ public class SearchResultItem {
return domainId;
}
@Override
public int compareTo(@NotNull SearchResultItem o) {
    // Deliberately compares o against this: the inversion makes higher
    // scores sort first; ties are broken by the combined id.
    int byScore = o.getScore().compareTo(getScore());
    return byScore != 0 ? byScore : Long.compare(combinedId, o.combinedId);
}
}

View File

@ -29,33 +29,17 @@ package nu.marginalia.model.id;
* </pre></code>
*/
public class UrlIdCodec {
private static final long RANK_MASK = 0x8600_0000_0000_0000L;
private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
private static final int DOCORD_MASK = 0x03FF_FFFF;
/** Encode a URL id without a ranking element.
 *
 * @param domainId domain identifier; masked to 31 bits
 * @param documentOrdinal document ordinal within the domain; masked to 26 bits
 * @return the combined id: [ domain-id (31 bits) | document-ordinal (26 bits) ]
 */
public static long encodeId(int domainId, int documentOrdinal) {
    // Mask each component exactly once; the duplicated mask statement
    // (0x7FFF_FFFFL then 0x7FFF_FFFF) was a leftover merge artifact.
    domainId &= 0x7FFF_FFFF;
    documentOrdinal &= 0x03FF_FFFF;
    return ((long) domainId << 26) | documentOrdinal;
}
/** Encode a URL id with the optional ranking part.
 *
 * @param rank domain importance in [0,1]; low is good
 * @param domainId domain identifier
 * @param documentOrdinal document ordinal within the domain
 * @return the combined id with the quantized rank stored in bits 57 and up
 */
public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) {
    // Quantize the rank to 6 bits, clamped into [0, 63].
    long rankBits = Math.max(0, Math.min(63, (int) (rank * 64)));
    return encodeId(domainId, documentOrdinal) | (rankBits << 57);
}
/** Add a ranking element to an existing combined URL id.
*
* @param rank [0,1] the importance of the domain, low is good
@ -88,7 +72,7 @@ public class UrlIdCodec {
/** Mask out the ranking element from this URL id.
 *
 * @param combinedId a combined id, possibly carrying a rank component
 * @return the same id with the rank bits cleared
 */
public static long removeRank(long combinedId) {
    // A duplicated (and therefore unreachable) return statement slipped in
    // during the refactoring; a single return suffices.
    return combinedId & ~RANK_MASK;
}
}

View File

@ -27,6 +27,25 @@ class UrlIdCodecTest {
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
}
@Test
// A rank of 1.0 (the maximum) must saturate the rank field at 63 while
// leaving the domain id and document ordinal untouched.
public void testRankBoundsAdd() {
long encoded = UrlIdCodec.encodeId(0, 0);
encoded = UrlIdCodec.addRank(1.f, encoded);
assertEquals(0, UrlIdCodec.getDomainId(encoded));
assertEquals(63, UrlIdCodec.getRank(encoded));
assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
}
@Test
// removeRank must strip only the rank bits: the maximal domain id and
// document ordinal must survive a round trip through addRank/removeRank.
public void testRemoveRank() {
long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0);
encoded = UrlIdCodec.addRank(1.f, encoded);
encoded = UrlIdCodec.removeRank(encoded);
assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded));
assertEquals(0, UrlIdCodec.getRank(encoded));
assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
}
@Test
public void testRankBoundsNeg() {
long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0);

View File

@ -37,6 +37,10 @@ public class DomainRankings {
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
}
/** Return the domain's rank normalized to [0,1], suitable for feeding to
 * UrlIdCodec.addRank; domains missing from the table get the maximum
 * (worst) value.  NOTE(review): assumes MAX_RANK_VALUE is positive — verify. */
public float getSortRanking(int domainId) {
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE;
}
public int size() {
return rankings.size();
}

View File

@ -10,6 +10,8 @@ import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import org.roaringbitmap.IntConsumer;
import org.roaringbitmap.RoaringBitmap;
import org.roaringbitmap.longlong.LongConsumer;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -83,12 +85,11 @@ public class ForwardIndexConverter {
LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
journalReader.forEach(entry -> {
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());
long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId());
int ranking = domainRankings.getRanking(entry.domainId());
long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
});
@ -109,17 +110,18 @@ public class ForwardIndexConverter {
}
private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
RoaringBitmap rbm = new RoaringBitmap();
journalReader.forEachUrlId(rbm::add);
Roaring64Bitmap rbm = new Roaring64Bitmap();
journalReader.forEachDocId(rbm::add);
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getCardinality());
rbm.forEach(new IntConsumer() {
LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality());
rbm.forEach(new LongConsumer() {
int offset;
@Override
public void accept(int value) {
public void accept(long value) {
ret.set(offset++, value);
}
});
return ret;
}

View File

@ -1,9 +1,8 @@
package nu.marginalia.index.forward;
/** Layout constants for forward index entries.
 *
 * Each document occupies ENTRY_SIZE consecutive longs.  The per-entry
 * domain id field was dropped (the domain id is recoverable from the
 * document id itself via UrlIdCodec), leaving metadata and html-features.
 * The old three-field layout was left behind as duplicate definitions by
 * the diff; only the new layout is kept here.
 */
class ForwardIndexParameters {
    public static final int ENTRY_SIZE = 2;
    public static final int METADATA_OFFSET = 0;
    public static final int FEATURES_OFFSET = 1;
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.index.forward;
import com.upserve.uppend.blobs.NativeIO;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -71,6 +72,8 @@ public class ForwardIndexReader {
}
public long getDocMeta(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = idxForDoc(docId);
if (offset < 0) return 0;
@ -78,20 +81,17 @@ public class ForwardIndexReader {
}
/** Look up the html-features bit mask for the given document id;
 * returns 0 when the document has no entry (negative offset). */
public int getHtmlFeatures(long docId) {
// Ids must arrive with the rank component already masked out.
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
}
/** Look up the domain id stored for the given document id; returns 0 when
 * the document has no entry.  NOTE(review): this accessor is being removed
 * elsewhere in this change in favor of UrlIdCodec.getDomainId — confirm no
 * callers remain before keeping it. */
public int getDomainId(long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
}
/** Translate a document id into its entry index in the data array;
 * callers treat a negative result as "absent". */
private int idxForDoc(long docId) {
// Ids must arrive with the rank component already masked out.
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
return idToOffset.get(docId);
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.forward;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.query.IndexQueryParams;
@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
}
@Override
public boolean test(long docId) {
int urlId = (int) (docId & 0xFFFF_FFFFL);
int domainId = forwardIndexReader.getDomainId(urlId);
long meta = forwardIndexReader.getDocMeta(urlId);
public boolean test(long combinedId) {
long docId = UrlIdCodec.removeRank(combinedId);
int domainId = UrlIdCodec.getDomainId(docId);
long meta = forwardIndexReader.getDocMeta(docId);
if (!validateDomain(domainId, meta)) {
return false;

View File

@ -113,8 +113,9 @@ class ForwardIndexConverterTest {
var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
for (int i = 36; i < workSetSize; i++) {
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
assertEquals(i/20, forwardReader.getDomainId(i));
long docId = createId(i, i/20);
assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
assertEquals(i/20, UrlIdCodec.getDomainId(docId));
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.index.journal.reader;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.model.id.UrlIdCodec;
import java.io.DataInputStream;
import java.io.IOException;
@ -51,11 +52,7 @@ public class IndexJournalReadEntry {
}
/** Extract the domain id component of this entry's document id. */
public int domainId() {
    // The old shift/mask arithmetic and the urlId() accessor were replaced
    // by the UrlIdCodec scheme; the half-deleted duplicate method body
    // (an unreachable second return) was a merge artifact.
    return UrlIdCodec.getDomainId(docId());
}
public IndexJournalEntryData readEntry() {

View File

@ -8,6 +8,7 @@ import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.util.Iterator;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
int FILE_HEADER_SIZE_LONGS = 2;
@ -19,13 +20,12 @@ public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
void forEachWordId(IntConsumer consumer);
void forEachUrlIdWordId(BiIntConsumer consumer);
void forEachDocIdWordId(LongIntConsumer consumer);
void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer);
void forEachUrlId(IntConsumer consumer);
void forEachDocId(LongConsumer consumer);
@NotNull
@Override

View File

@ -14,6 +14,7 @@ import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
import java.util.function.IntConsumer;
import java.util.function.LongConsumer;
import java.util.function.Predicate;
public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
@ -115,19 +116,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
}
@Override
public void forEachUrlIdWordId(BiIntConsumer consumer) {
for (var entry : this) {
var data = entry.readEntry();
for (var post : data) {
if (filter(entry, post)) {
consumer.accept(entry.urlId(), post.wordId());
}
}
}
}
@Override
public void forEachDocIdWordId(LongIntConsumer consumer) {
for (var entry : this) {
@ -154,10 +142,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
}
}
@Override
public void forEachUrlId(IntConsumer consumer) {
public void forEachDocId(LongConsumer consumer) {
for (var entry : this) {
if (filter(entry)) {
consumer.accept(entry.urlId());
consumer.accept(entry.docId());
}
}
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.model.id.UrlIdCodec;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@ -25,6 +26,9 @@ public class IndexJournalTest {
KeywordLexicon lexicon;
IndexJournalReader reader;
long firstDocId = UrlIdCodec.encodeId(44, 10);
long secondDocId = UrlIdCodec.encodeId(43, 15);
@BeforeEach
public void setUp() throws IOException {
tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
@ -65,11 +69,11 @@ public class IndexJournalTest {
}
@Test
public void forEachUrlId() {
List<Integer> expected = List.of(10, 15);
List<Integer> actual = new ArrayList<>();
public void forEachDocId() {
List<Long> expected = List.of(firstDocId, secondDocId);
List<Long> actual = new ArrayList<>();
reader.forEachUrlId(actual::add);
reader.forEachDocId(actual::add);
assertEquals(expected, actual);
}
@ -82,31 +86,15 @@ public class IndexJournalTest {
assertEquals(expected, actual);
}
@Test
public void forEachUrlIdWordId() {
List<Pair<Integer, Integer>> expected = List.of(
Pair.of(10, 1),
Pair.of(10, 2),
Pair.of(10, 3),
Pair.of(10, 5),
Pair.of(15, 5),
Pair.of(15, 6));
List<Pair<Integer, Integer>> actual = new ArrayList<>();
reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word)));
assertEquals(expected, actual);
}
@Test
public void forEachDocIdWordId() {
List<Pair<Long, Integer>> expected = List.of(
Pair.of(10L | (44L << 32), 1),
Pair.of(10L | (44L << 32), 2),
Pair.of(10L | (44L << 32), 3),
Pair.of(10L | (44L << 32), 5),
Pair.of(15L | (43L << 32), 5),
Pair.of(15L | (43L << 32), 6));
Pair.of(firstDocId, 1),
Pair.of(firstDocId, 2),
Pair.of(firstDocId, 3),
Pair.of(firstDocId, 5),
Pair.of(secondDocId, 5),
Pair.of(secondDocId, 6));
List<Pair<Long, Integer>> actual = new ArrayList<>();
reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word)));
@ -116,12 +104,12 @@ public class IndexJournalTest {
@Test
public void forEachDocIdRecord() {
List<Pair<Long, IndexJournalEntryData.Record>> expected = List.of(
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)),
Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)),
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)),
Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6))
Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)),
Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)),
Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)),
Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6))
);
List<Pair<Long, IndexJournalEntryData.Record>> actual = new ArrayList<>();

View File

@ -3,10 +3,10 @@ package nu.marginalia.index.searchset;
public interface SearchSet {
/**
* Returns true if the given urlId is contained in the set
* Returns true if the given domainId is contained in the set
* or if the documentMetadata is otherwise compatible with the set
*
*/
boolean contains(int urlId, long documentMetadata);
boolean contains(int domainId, long documentMetadata);
}

View File

@ -7,6 +7,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.array.IntArray;
@ -179,21 +180,9 @@ public class ReverseIndexFullConverter {
@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {
/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/
int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;
int domainId = UrlIdCodec.getDomainId(docId);
float rankingPart = domainRankings.getSortRanking(domainId);
long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
final int wordId = record.wordId();
long offset = startOfRange(wordId);

View File

@ -111,10 +111,23 @@ public class ReverseIndexFullReader {
return new long[docIds.length];
}
Arrays.sort(docIds);
assert isSorted(docIds) : "The input array docIds is assumed to be sorted";
var reader = createReaderNew(offset);
return reader.queryData(docIds, 1);
}
/** Check that ids is in strictly ascending order (validates the sortedness
 * precondition asserted by the caller).
 *
 * @param ids candidate array
 * @return true when ids is strictly ascending (trivially true when empty)
 */
private boolean isSorted(long[] ids) {
    // Bug fix: the previous version never advanced its 'prev' cursor, so it
    // only ever compared elements against ids[0] and accepted unsorted
    // arrays such as [1, 5, 3].
    for (int i = 1; i < ids.length; i++) {
        if (ids[i] <= ids[i - 1])
            return false;
    }
    return true;
}
}

View File

@ -10,6 +10,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalStatistics;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.rwf.RandomWriteFunnel;
import nu.marginalia.service.control.ServiceHeartbeat;
@ -178,21 +179,9 @@ public class ReverseIndexPriorityConverter {
@SneakyThrows
@Override
public void accept(long docId, IndexJournalEntryData.Record record) {
/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/
int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;
int domainId = UrlIdCodec.getDomainId(docId);
float rankingPart = domainRankings.getSortRanking(domainId);
long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
final int wordId = record.wordId();
long offset = startOfRange(wordId);

View File

@ -9,6 +9,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@ -113,17 +114,17 @@ class ReverseIndexFullConverterTest {
var buffer = new LongQueryBuffer(32);
reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
assertArrayEquals(LongStream.range(1, 17).map(this::addMaxRank).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(this::addMaxRank).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer);
assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(this::addMaxRank).toArray(), buffer.copyData());
System.out.println(buffer);
buffer.reset();
@ -137,4 +138,9 @@ class ReverseIndexFullConverterTest {
TestUtil.clearTempDir(dataDir);
}
/** Add the maximal (worst) domain rank component to the input, when
 * interpreted as a combined document ID; a rank of 1.0 saturates the
 * rank field at its upper bound. */
private long addMaxRank(long in) {
return UrlIdCodec.addRank(1f, in);
}
}

View File

@ -11,6 +11,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@ -101,8 +102,8 @@ class ReverseIndexFullConverterTest2 {
return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
}
long createId(long url, long domain) {
return (domain << 32) | url;
long createId(int url, int domain) {
return UrlIdCodec.encodeId(domain, url);
}
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
int[] factors = getFactorsI(id);

View File

@ -13,6 +13,7 @@ import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.control.ServiceTaskHeartbeat;
@ -101,8 +102,8 @@ class ReverseIndexPriorityConverterTest2 {
return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
}
long createId(long url, long domain) {
return (domain << 32) | url;
long createId(int url, int domain) {
return UrlIdCodec.encodeId(domain, url);
}
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
int[] factors = getFactorsI(id);

View File

@ -188,7 +188,11 @@ public class SearchIndex {
indexReader.numHitsPrio(b)
);
}
/** Replaces the values of ids with their associated metadata, or 0L if absent */
/** Return an array of encoded document metadata longs corresponding to the
* document identifiers provided; with metadata for termId. The input array
* docs[] *must* be sorted.
*/
public long[] getTermMetadata(int termId, long[] docs) {
return indexReader.getMetadata(termId, docs);
}
@ -200,10 +204,6 @@ public class SearchIndex {
return indexReader.getHtmlFeatures(docId);
}
public int getDomainId(long docId) {
return indexReader.getDomainId(docId);
}
public int getTotalDocCount() {
return indexReader.totalDocCount();
}

View File

@ -60,10 +60,6 @@ public class SearchIndexReader {
return forwardIndexReader.getDocMeta(docId);
}
public int getDomainId(long docId) {
return forwardIndexReader.getDomainId(docId);
}
public int totalDocCount() {
return forwardIndexReader.totalDocCount();
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.index.results;
import com.google.inject.Inject;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.index.client.model.query.SearchSubquery;
@ -18,7 +19,6 @@ import java.util.OptionalInt;
public class IndexMetadataService {
private final SearchIndex index;
private final SearchTermsService searchTermsService;
private final ResultValuator searchResultValuator;
@Inject
@ -30,34 +30,16 @@ public class IndexMetadataService {
this.searchResultValuator = searchResultValuator;
}
public long getDocumentMetadata(long urlId) {
return index.getDocumentMetadata(urlId);
public long getDocumentMetadata(long docId) {
return index.getDocumentMetadata(docId);
}
public int getHtmlFeatures(long urlId) {
return index.getHtmlFeatures(urlId);
}
public int getDomainId(long urlId) {
return index.getDomainId(urlId);
}
public long[] getTermMetadata(int termId, long[] docIdsAll) {
return index.getTermMetadata(termId, docIdsAll);
}
public TermMetadata getTermMetadata(long[] docIdsAll, int[] termIdsList) {
var termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsList.length, 0.5f);
for (int term : termIdsList) {
var metadata = getTermMetadata(term, docIdsAll);
for (int i = 0; i < docIdsAll.length; i++) {
termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]);
}
}
return new TermMetadata(termdocToMeta);
public TermMetadataForDocuments getTermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) {
return new TermMetadataForDocuments(docIdsAll, termIdsList);
}
public QuerySearchTerms getSearchTerms(List<SearchSubquery> searchTermVariants) {
@ -80,7 +62,6 @@ public class IndexMetadataService {
}
}
return new QuerySearchTerms(termToId,
termIdsList.toIntArray(),
getTermCoherences(searchTermVariants));
@ -92,7 +73,10 @@ public class IndexMetadataService {
for (var subquery : searchTermVariants) {
for (var coh : subquery.searchTermCoherences) {
int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray();
int[] ids = coh.stream().map(searchTermsService::lookUpWord)
.filter(OptionalInt::isPresent)
.mapToInt(OptionalInt::getAsInt)
.toArray();
coherences.add(ids);
}
@ -116,30 +100,43 @@ public class IndexMetadataService {
var ret = new TLongHashSet(resultsArray.length);
for (int priorityTerm : priorityTermIds) {
long[] metadata = getTermMetadata(priorityTerm, resultsArray);
long[] metadata = index.getTermMetadata(priorityTerm, resultsArray);
for (int i = 0; i < metadata.length; i++) {
if (metadata[i] != 0) ret.add(resultsArray[i]);
}
}
return ret;
}
public ResultValuator getSearchResultValuator() {
return searchResultValuator;
}
public static class TermMetadata {
private final Long2LongOpenHashMap termdocToMeta;
public class TermMetadataForDocuments {
private final Int2ObjectArrayMap<Long2LongOpenHashMap> termdocToMeta;
public TermMetadata(Long2LongOpenHashMap termdocToMeta) {
this.termdocToMeta = termdocToMeta;
public TermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) {
termdocToMeta = new Int2ObjectArrayMap<>(termIdsList.length);
for (int termId : termIdsList) {
var mapForTerm = new Long2LongOpenHashMap(docIdsAll.length);
var metadata = index.getTermMetadata(termId, docIdsAll);
for (int i = 0; i < docIdsAll.length; i++) {
mapForTerm.put(docIdsAll[i], metadata[i]);
}
termdocToMeta.put(termId, mapForTerm);
}
}
public long getTermMetadata(int termId, long docId) {
return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0);
var docsForTerm = termdocToMeta.get(termId);
if (docsForTerm == null) {
return 0;
}
return docsForTerm.getOrDefault(docId, 0);
}
public boolean testCoherence(long docId, TermCoherences coherences) {
@ -164,20 +161,19 @@ public class IndexMetadataService {
public final TermCoherences coherences;
public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll, TermCoherences coherences) {
public QuerySearchTerms(TObjectIntHashMap<String> termToId,
int[] termIdsAll,
TermCoherences coherences) {
this.termToId = termToId;
this.termIdsAll = termIdsAll;
this.coherences = coherences;
}
public int get(String searchTerm) {
public int getIdForTerm(String searchTerm) {
return termToId.get(searchTerm);
}
}
/** wordIds that we require to be in the same sentence */
public record TermCoherences(List<int[]> words) {}
private static long termdocKey(int termId, long docId) {
return (docId << 32) | Integer.toUnsignedLong(termId);
}
}

View File

@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.IndexQueryParams;
import nu.marginalia.ranking.ResultValuator;
import java.util.Arrays;
import java.util.List;
public class IndexResultValuator {
@ -21,7 +22,7 @@ public class IndexResultValuator {
private final IndexQueryParams queryParams;
private final TLongHashSet resultsWithPriorityTerms;
private final IndexMetadataService.TermMetadata termMetadata;
private final IndexMetadataService.TermMetadataForDocuments termMetadataForDocuments;
private final IndexMetadataService.QuerySearchTerms searchTerms;
private final ResultRankingContext rankingContext;
@ -36,16 +37,17 @@ public class IndexResultValuator {
this.rankingContext = rankingContext;
this.searchResultValuator = metadataService.getSearchResultValuator();
final long[] resultsArray = results.toArray();
final long[] ids = results.toArray();
Arrays.sort(ids);
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams;
this.metadataService = metadataService;
this.searchTerms = metadataService.getSearchTerms(subqueries);
this.termMetadata = metadataService.getTermMetadata(results.toArray(), searchTerms.termIdsAll);
this.termMetadataForDocuments = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, resultsArray);
resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, ids);
}
private final long flagsFilterMask =
@ -54,12 +56,10 @@ public class IndexResultValuator {
public SearchResultItem calculatePreliminaryScore(long id) {
SearchResultItem searchResult = new SearchResultItem(id);
final long urlIdInt = searchResult.getUrlIdInt();
final long docId = searchResult.getDocumentId();
searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
long docMetadata = metadataService.getDocumentMetadata(urlIdInt);
int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt);
long docMetadata = metadataService.getDocumentMetadata(docId);
int htmlFeatures = metadataService.getHtmlFeatures(docId);
int maxFlagsCount = 0;
boolean anyAllSynthetic = false;
@ -76,21 +76,21 @@ public class IndexResultValuator {
for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
String searchTerm = termList.get(termIdx);
long metadata = termMetadata.getTermMetadata(
searchTerms.get(searchTerm),
searchResult.getUrlIdInt()
long termMetadata = termMetadataForDocuments.getTermMetadata(
searchTerms.getIdForTerm(searchTerm),
searchResult.combinedId
);
var score = new SearchResultKeywordScore(
querySetId,
searchTerm,
metadata,
termMetadata,
docMetadata,
htmlFeatures,
resultsWithPriorityTerms.contains(searchResult.combinedId)
);
synthetic &= WordFlags.Synthetic.isPresent(metadata);
synthetic &= WordFlags.Synthetic.isPresent(termMetadata);
searchResult.keywordScores.add(score);
@ -117,11 +117,13 @@ public class IndexResultValuator {
final boolean hasPriorityTerm = resultsWithPriorityTerms.contains(id);
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext);
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores,
5000,
rankingContext);
boolean disqualified = false;
if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences))
if (!termMetadataForDocuments.testCoherence(docId, searchTerms.coherences))
disqualified = true;
else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
disqualified = true;

View File

@ -266,9 +266,7 @@ public class IndexQueryService {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
results.sort(Comparator.comparing(SearchResultItem::getScore).reversed()
.thenComparingInt(SearchResultItem::getRanking)
.thenComparingInt(SearchResultItem::getUrlIdInt));
results.sort(Comparator.naturalOrder());
List<SearchResultItem> resultsList = new ArrayList<>(results.size());

View File

@ -63,12 +63,13 @@ public class RankingSearchSet implements SearchSet {
}
@Override
public boolean contains(int urlId, long documentMetadata) {
public boolean contains(int domainId, long documentMetadata) {
// This is the main check
if (set.contains(urlId) || set.isEmpty()) {
if (set.contains(domainId) || set.isEmpty()) {
return true;
}
// TODO
return false;
}

View File

@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet;
public class SearchSetAny implements SearchSet {
@Override
public boolean contains(int urlId, long meta) {
public boolean contains(int domainId, long meta) {
return true;
}

View File

@ -15,6 +15,7 @@ import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;
@ -96,12 +97,14 @@ public class IndexQueryServiceIntegrationTest {
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
Assertions.assertArrayEquals(
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
rsp.results
.stream()
.mapToInt(SearchResultItem::getUrlIdInt)
.toArray());
int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
long[] actual = rsp.results
.stream()
.mapToLong(SearchResultItem::getDocumentId)
.toArray();
Assertions.assertArrayEquals(ids, actual);
}
@ -127,9 +130,11 @@ public class IndexQueryServiceIntegrationTest {
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
Assertions.assertArrayEquals(
new int[] { 210, 270 },
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
int[] idxes = new int[] { 210, 270 };
long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
Assertions.assertArrayEquals(ids, actual);
}
@Test
@ -169,13 +174,17 @@ public class IndexQueryServiceIntegrationTest {
}
/** Build the combined id used by this fixture: the domain id is derived
 * from the document id as (32 - id % 32), matching the pre-refactor
 * packing used by loadData. */
private long fullId(int id) {
return UrlIdCodec.encodeId((32 - (id % 32)), id);
}
public void loadData(int id) {
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();
long fullId = id | ((long) (32 - (id % 32)) << 32);
long fullId = fullId(id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
@ -190,7 +199,7 @@ public class IndexQueryServiceIntegrationTest {
public void loadDataWithDomain(int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue());
var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {