(index) Implement new URL ID coding scheme.

Also refactor along the way. Really needs an additional pass, these tests are very hairy.

parent 6a04cdfddf
commit 9894f37412

SearchResultItem.java
@@ -4,13 +4,15 @@ import lombok.AllArgsConstructor;
 import lombok.Getter;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.id.EdgeId;
+import nu.marginalia.model.id.UrlIdCodec;
+import org.jetbrains.annotations.NotNull;
 
 import java.util.ArrayList;
 import java.util.List;
 
 /** Represents a document matching a search query */
 @AllArgsConstructor @Getter
-public class SearchResultItem {
+public class SearchResultItem implements Comparable<SearchResultItem> {
     /** Encoded ID that contains both the URL id and its ranking */
     public final long combinedId;
 
@@ -25,15 +27,22 @@ public class SearchResultItem {
         this.keywordScores = new ArrayList<>(16);
     }
 
+    @Deprecated
     public EdgeId<EdgeUrl> getUrlId() {
         return new EdgeId<>(getUrlIdInt());
     }
+
+    public long getDocumentId() {
+        return UrlIdCodec.removeRank(combinedId);
+    }
+
+    @Deprecated
     public int getUrlIdInt() {
         return (int)(combinedId & 0xFFFF_FFFFL);
     }
 
     public int getRanking() {
-        return (int)(combinedId >>> 32);
+        return UrlIdCodec.getRank(combinedId);
     }
 
     /* Used for evaluation */
@@ -45,16 +54,12 @@ public class SearchResultItem {
         return scoreValue;
     }
 
-    private transient int domainId = Integer.MIN_VALUE;
-    public void setDomainId(int domainId) {
-        this.domainId = domainId;
-    }
     public int getDomainId() {
-        return this.domainId;
+        return UrlIdCodec.getDomainId(this.combinedId);
    }
 
     public int hashCode() {
-        return getUrlIdInt();
+        return Long.hashCode(combinedId);
     }
 
     public String toString() {
@@ -67,7 +72,7 @@ public class SearchResultItem {
         if (other == this)
             return true;
         if (other instanceof SearchResultItem o) {
-            return o.getUrlIdInt() == getUrlIdInt();
+            return o.getDocumentId() == getDocumentId();
         }
         return false;
     }
@@ -81,4 +86,14 @@ public class SearchResultItem {
 
         return domainId;
     }
+
+    @Override
+    public int compareTo(@NotNull SearchResultItem o) {
+        // this looks like a bug, but we actually want this in a reversed order
+        int diff = o.getScore().compareTo(getScore());
+        if (diff != 0)
+            return diff;
+
+        return Long.compare(this.combinedId, o.combinedId);
+    }
 }
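
SearchResultItem now sorts naturally (best score first, ties broken on the raw combined id), which is what lets IndexQueryService further down replace its three-part comparator with Comparator.naturalOrder(). A minimal sketch of the same ordering pattern, using a hypothetical Result record rather than the real class:

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    // Hypothetical stand-in for SearchResultItem; not the class from this commit.
    record Result(double score, long combinedId) implements Comparable<Result> {
        @Override
        public int compareTo(Result o) {
            // Deliberately reversed: higher scores compare as "smaller" and sort first
            int diff = Double.compare(o.score, score);
            if (diff != 0)
                return diff;
            return Long.compare(combinedId, o.combinedId);
        }
    }

    class OrderingDemo {
        public static void main(String[] args) {
            List<Result> results = new ArrayList<>(List.of(
                    new Result(0.2, 7), new Result(0.9, 3), new Result(0.9, 1)));
            results.sort(Comparator.naturalOrder());
            // Best score first; equal scores tie-break on combinedId: 0.9/1, 0.9/3, 0.2/7
            System.out.println(results);
        }
    }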

UrlIdCodec.java
@@ -29,33 +29,17 @@ package nu.marginalia.model.id;
  * </pre></code>
  */
 public class UrlIdCodec {
-    private static final long RANK_MASK = 0x8600_0000_0000_0000L;
+    private static final long RANK_MASK = 0xFE00_0000_0000_0000L;
     private static final int DOCORD_MASK = 0x03FF_FFFF;
 
     /** Encode a URL id without a ranking element */
     public static long encodeId(int domainId, int documentOrdinal) {
-        domainId &= 0x7FFF_FFFFL;
+        domainId &= 0x7FFF_FFFF;
         documentOrdinal &= 0x03FF_FFFF;
 
         return ((long) domainId << 26) | documentOrdinal;
     }
 
-    /** Encode a URL id with the optional ranking part
-     *
-     * @param rank [0,1] the importance of the domain, low is good
-     * @param domainId
-     * @param documentOrdinal
-     * @return
-     */
-    public static long encodeIdWithRank(float rank, int domainId, int documentOrdinal) {
-        long rankPart = (int)(rank * (1<<6));
-
-        if (rankPart >= 64) rankPart = 63;
-        if (rankPart < 0) rankPart = 0;
-
-        return encodeId(domainId, documentOrdinal) | (rankPart << 57);
-    }
-
     /** Add a ranking element to an existing combined URL id.
      *
      * @param rank [0,1] the importance of the domain, low is good
@@ -88,7 +72,7 @@ public class UrlIdCodec {
 
     /** Mask out the ranking element from this URL id */
     public static long removeRank(long combinedId) {
-        return combinedId & (~RANK_MASK);
+        return combinedId & ~RANK_MASK;
     }
 
 }
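
Taken together with DOCORD_MASK and the 26-bit shift, the new constants imply this 64-bit layout: one unused sign bit, a 6-bit domain rank in bits 57-62, a 31-bit domain id in bits 26-56, and a 26-bit document ordinal in bits 0-25. The old RANK_MASK of 0x8600... covered only bits 63, 58 and 57, so removeRank could leave stale rank bits behind; 0xFE00... covers the whole rank field plus the unused sign bit. A sketch of the scheme follows; the decode methods and addRank are assumptions reconstructed from the masks and the removed encodeIdWithRank, not verbatim code from the commit:

    // Assumed reconstruction of the coding scheme; only encodeId/removeRank appear
    // verbatim in this diff, the rest is inferred from the masks.
    public class UrlIdCodecSketch {
        static final long RANK_MASK = 0xFE00_0000_0000_0000L;

        // [ sign (unused) | 6-bit rank | 31-bit domain id | 26-bit document ordinal ]
        static long encodeId(int domainId, int documentOrdinal) {
            return ((long) (domainId & 0x7FFF_FFFF) << 26) | (documentOrdinal & 0x03FF_FFFF);
        }

        static long addRank(float rank, long combinedId) {     // rank in [0,1], low is good
            long rankPart = (int) (rank * (1 << 6));
            if (rankPart >= 64) rankPart = 63;
            if (rankPart < 0) rankPart = 0;
            return combinedId | (rankPart << 57);
        }

        static long removeRank(long combinedId) { return combinedId & ~RANK_MASK; }
        static int getRank(long id)             { return (int) (id >>> 57) & 0x3F; }
        static int getDomainId(long id)         { return (int) (id >>> 26) & 0x7FFF_FFFF; }
        static int getDocumentOrdinal(long id)  { return (int) (id & 0x03FF_FFFF); }

        public static void main(String[] args) {
            long id = addRank(1.0f, encodeId(0x7FFF_FFFF, ~0));
            assert getRank(id) == 63 && getDomainId(removeRank(id)) == 0x7FFF_FFFF;
        }
    }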

UrlIdCodecTest.java
@@ -27,6 +27,25 @@ class UrlIdCodecTest {
         assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
     }
 
+    @Test
+    public void testRankBoundsAdd() {
+        long encoded = UrlIdCodec.encodeId(0, 0);
+        encoded = UrlIdCodec.addRank(1.f, encoded);
+        assertEquals(0, UrlIdCodec.getDomainId(encoded));
+        assertEquals(63, UrlIdCodec.getRank(encoded));
+        assertEquals(0, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
+    @Test
+    public void testRemoveRank() {
+        long encoded = UrlIdCodec.encodeId(0x7FFF_FFFF, ~0);
+        encoded = UrlIdCodec.addRank(1.f, encoded);
+        encoded = UrlIdCodec.removeRank(encoded);
+        assertEquals(0x7FFF_FFFFL, UrlIdCodec.getDomainId(encoded));
+        assertEquals(0, UrlIdCodec.getRank(encoded));
+        assertEquals(0x03FF_FFFF, UrlIdCodec.getDocumentOrdinal(encoded));
+    }
+
     @Test
     public void testRankBoundsNeg() {
         long encoded = UrlIdCodec.encodeIdWithRank(-1.0f, 0, 0);

DomainRankings.java
@@ -37,6 +37,10 @@ public class DomainRankings {
         return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
     }
 
+    public float getSortRanking(int domainId) {
+        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE;
+    }
+
     public int size() {
         return rankings.size();
     }
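
The new getSortRanking maps the stored short ranking onto the [0,1] float that UrlIdCodec.addRank expects ("low is good"). A worked example, assuming MAX_RANK_VALUE is 255 (its actual value is not shown in this diff) and the 6-bit scaling of the removed encodeIdWithRank:

    // Both the 255 and the (1 << 6) scaling are assumptions for illustration.
    float sortRanking = 64 / 255.0f;                 // domain ranked 64 out of 255 -> ~0.25
    int rankPart = (int) (sortRanking * (1 << 6));   // -> 16 in the 6-bit rank field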

ForwardIndexConverter.java
@@ -10,6 +10,8 @@ import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import org.roaringbitmap.IntConsumer;
 import org.roaringbitmap.RoaringBitmap;
+import org.roaringbitmap.longlong.LongConsumer;
+import org.roaringbitmap.longlong.Roaring64Bitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -83,12 +85,11 @@ public class ForwardIndexConverter {
         LongArray docFileData = LongArray.mmapForWriting(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size());
 
         journalReader.forEach(entry -> {
-            long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.urlId());
+            long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(entry.docId());
 
             int ranking = domainRankings.getRanking(entry.domainId());
             long meta = DocumentMetadata.encodeRank(entry.docMeta(), ranking);
 
-            docFileData.set(entryOffset + ForwardIndexParameters.DOMAIN_OFFSET, entry.domainId());
             docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
             docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, entry.header.documentFeatures());
         });
@@ -109,17 +110,18 @@ public class ForwardIndexConverter {
     }
 
     private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException {
-        RoaringBitmap rbm = new RoaringBitmap();
-        journalReader.forEachUrlId(rbm::add);
+        Roaring64Bitmap rbm = new Roaring64Bitmap();
+        journalReader.forEachDocId(rbm::add);
+
         LongArray ret = LongArray.mmapForWriting(outputFileDocs, rbm.getIntCardinality());
-        rbm.forEach(new IntConsumer() {
+        rbm.forEach(new LongConsumer() {
             int offset;
             @Override
-            public void accept(int value) {
+            public void accept(long value) {
                 ret.set(offset++, value);
             }
         });
 
         return ret;
     }
 

ForwardIndexParameters.java
@@ -1,9 +1,8 @@
 package nu.marginalia.index.forward;
 
 class ForwardIndexParameters {
-    public static final int ENTRY_SIZE = 3;
-    public static final int DOMAIN_OFFSET = 0;
-    public static final int METADATA_OFFSET = 1;
-    public static final int FEATURES_OFFSET = 2;
+    public static final int ENTRY_SIZE = 2;
+    public static final int METADATA_OFFSET = 0;
+    public static final int FEATURES_OFFSET = 1;
 
 }
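
Since the domain id is now recoverable from the document id itself via UrlIdCodec.getDomainId, the forward index drops its DOMAIN column: each entry shrinks from three longs to two, and the remaining fields shift down. A sketch of how a row is addressed after the change (names mirror ForwardIndexParameters and ForwardIndexReader; idToOffset and data are assumed from context):

    // ENTRY_SIZE = 2: each row holds [ metadata | features ]; no domain column.
    long idx      = idToOffset.get(docId);                               // row index
    long meta     = data.get(ENTRY_SIZE * idx + METADATA_OFFSET);        // slot 0
    int  features = (int) data.get(ENTRY_SIZE * idx + FEATURES_OFFSET);  // slot 1
    int  domainId = UrlIdCodec.getDomainId(docId);                       // derived, not stored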

ForwardIndexReader.java
@@ -3,6 +3,7 @@ package nu.marginalia.index.forward;
 import com.upserve.uppend.blobs.NativeIO;
 import gnu.trove.map.hash.TLongIntHashMap;
 import nu.marginalia.array.LongArray;
+import nu.marginalia.model.id.UrlIdCodec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,6 +72,8 @@ public class ForwardIndexReader {
     }
 
     public long getDocMeta(long docId) {
+        assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
+
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
 
@@ -78,20 +81,17 @@ public class ForwardIndexReader {
     }
 
     public int getHtmlFeatures(long docId) {
+        assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
+
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
 
         return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
     }
 
-    public int getDomainId(long docId) {
-        long offset = idxForDoc(docId);
-        if (offset < 0) return 0;
-
-        return Math.max(0, (int) data.get(ENTRY_SIZE * offset + DOMAIN_OFFSET));
-    }
-
     private int idxForDoc(long docId) {
+        assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
+
         return idToOffset.get(docId);
     }
 

ParamMatchingQueryFilter.java
@@ -1,5 +1,6 @@
 package nu.marginalia.index.forward;
 
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.index.query.IndexQueryParams;
@@ -15,10 +16,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
     }
 
     @Override
-    public boolean test(long docId) {
-        int urlId = (int) (docId & 0xFFFF_FFFFL);
-        int domainId = forwardIndexReader.getDomainId(urlId);
-        long meta = forwardIndexReader.getDocMeta(urlId);
+    public boolean test(long combinedId) {
+        long docId = UrlIdCodec.removeRank(combinedId);
+        int domainId = UrlIdCodec.getDomainId(docId);
+
+        long meta = forwardIndexReader.getDocMeta(docId);
 
         if (!validateDomain(domainId, meta)) {
             return false;

ForwardIndexConverterTest.java
@@ -113,8 +113,9 @@ class ForwardIndexConverterTest {
         var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
 
         for (int i = 36; i < workSetSize; i++) {
-            assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(i));
-            assertEquals(i/20, forwardReader.getDomainId(i));
+            long docId = createId(i, i/20);
+            assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId));
+            assertEquals(i/20, UrlIdCodec.getDomainId(docId));
         }
 
     }

IndexJournalReadEntry.java
@@ -2,6 +2,7 @@ package nu.marginalia.index.journal.reader;
 
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginalia.model.id.UrlIdCodec;
 
 import java.io.DataInputStream;
 import java.io.IOException;
@@ -51,11 +52,7 @@ public class IndexJournalReadEntry {
     }
 
     public int domainId() {
-        return (int) (docId() >>> 32L);
-    }
-
-    public int urlId() {
-        return (int) (docId() & 0xFFFF_FFFFL);
+        return UrlIdCodec.getDomainId(docId());
     }
 
     public IndexJournalEntryData readEntry() {

IndexJournalReader.java
@@ -8,6 +8,7 @@ import org.jetbrains.annotations.NotNull;
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
 
 public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
     int FILE_HEADER_SIZE_LONGS = 2;
@@ -19,13 +20,12 @@ public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
 
     void forEachWordId(IntConsumer consumer);
 
-    void forEachUrlIdWordId(BiIntConsumer consumer);
-
     void forEachDocIdWordId(LongIntConsumer consumer);
 
     void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer);
 
-    void forEachUrlId(IntConsumer consumer);
+    void forEachDocId(LongConsumer consumer);
 
     @NotNull
     @Override

IndexJournalReaderSingleCompressedFile.java
@@ -14,6 +14,7 @@ import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.Iterator;
 import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
 import java.util.function.Predicate;
 
 public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
@@ -115,19 +116,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
         }
     }
 
-    @Override
-    public void forEachUrlIdWordId(BiIntConsumer consumer) {
-        for (var entry : this) {
-            var data = entry.readEntry();
-
-            for (var post : data) {
-                if (filter(entry, post)) {
-                    consumer.accept(entry.urlId(), post.wordId());
-                }
-            }
-        }
-    }
-
     @Override
     public void forEachDocIdWordId(LongIntConsumer consumer) {
         for (var entry : this) {
@@ -154,10 +142,10 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
         }
     }
     @Override
-    public void forEachUrlId(IntConsumer consumer) {
+    public void forEachDocId(LongConsumer consumer) {
         for (var entry : this) {
             if (filter(entry)) {
-                consumer.accept(entry.urlId());
+                consumer.accept(entry.docId());
             }
         }
     }

IndexJournalTest.java
@@ -6,6 +6,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
 import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
 import nu.marginalia.lexicon.KeywordLexicon;
+import nu.marginalia.model.id.UrlIdCodec;
 import org.apache.commons.lang3.tuple.Pair;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -25,6 +26,9 @@ public class IndexJournalTest {
     KeywordLexicon lexicon;
     IndexJournalReader reader;
 
+    long firstDocId = UrlIdCodec.encodeId(44, 10);
+    long secondDocId = UrlIdCodec.encodeId(43, 15);
+
     @BeforeEach
     public void setUp() throws IOException {
         tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
@@ -65,11 +69,11 @@ public class IndexJournalTest {
     }
 
     @Test
-    public void forEachUrlId() {
-        List<Integer> expected = List.of(10, 15);
-        List<Integer> actual = new ArrayList<>();
+    public void forEachDocId() {
+        List<Long> expected = List.of(firstDocId, secondDocId);
+        List<Long> actual = new ArrayList<>();
 
-        reader.forEachUrlId(actual::add);
+        reader.forEachDocId(actual::add);
         assertEquals(expected, actual);
     }
 
@@ -82,31 +86,15 @@ public class IndexJournalTest {
         assertEquals(expected, actual);
     }
 
-
-    @Test
-    public void forEachUrlIdWordId() {
-        List<Pair<Integer, Integer>> expected = List.of(
-                Pair.of(10, 1),
-                Pair.of(10, 2),
-                Pair.of(10, 3),
-                Pair.of(10, 5),
-                Pair.of(15, 5),
-                Pair.of(15, 6));
-        List<Pair<Integer, Integer>> actual = new ArrayList<>();
-
-        reader.forEachUrlIdWordId((url, word) -> actual.add(Pair.of(url, word)));
-        assertEquals(expected, actual);
-    }
-
     @Test
     public void forEachDocIdWordId() {
         List<Pair<Long, Integer>> expected = List.of(
-                Pair.of(10L | (44L << 32), 1),
-                Pair.of(10L | (44L << 32), 2),
-                Pair.of(10L | (44L << 32), 3),
-                Pair.of(10L | (44L << 32), 5),
-                Pair.of(15L | (43L << 32), 5),
-                Pair.of(15L | (43L << 32), 6));
+                Pair.of(firstDocId, 1),
+                Pair.of(firstDocId, 2),
+                Pair.of(firstDocId, 3),
+                Pair.of(firstDocId, 5),
+                Pair.of(secondDocId, 5),
+                Pair.of(secondDocId, 6));
         List<Pair<Long, Integer>> actual = new ArrayList<>();
 
         reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word)));
@@ -116,12 +104,12 @@ public class IndexJournalTest {
     @Test
     public void forEachDocIdRecord() {
         List<Pair<Long, IndexJournalEntryData.Record>> expected = List.of(
-                Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(1, 2)),
-                Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(2, 3)),
-                Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(3, 4)),
-                Pair.of(10L | (44L << 32), new IndexJournalEntryData.Record(5, 6)),
-                Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(5, 5)),
-                Pair.of(15L | (43L << 32), new IndexJournalEntryData.Record(6, 6))
+                Pair.of(firstDocId, new IndexJournalEntryData.Record(1, 2)),
+                Pair.of(firstDocId, new IndexJournalEntryData.Record(2, 3)),
+                Pair.of(firstDocId, new IndexJournalEntryData.Record(3, 4)),
+                Pair.of(firstDocId, new IndexJournalEntryData.Record(5, 6)),
+                Pair.of(secondDocId, new IndexJournalEntryData.Record(5, 5)),
+                Pair.of(secondDocId, new IndexJournalEntryData.Record(6, 6))
         );
         List<Pair<Long, IndexJournalEntryData.Record>> actual = new ArrayList<>();
 

SearchSet.java
@@ -3,10 +3,10 @@ package nu.marginalia.index.searchset;
 public interface SearchSet {
 
     /**
-     * Returns true if the given urlId is contained in the set
+     * Returns true if the given domainId is contained in the set
      * or if the documentMetadata vibes with the set
      *
     */
-    boolean contains(int urlId, long documentMetadata);
+    boolean contains(int domainId, long documentMetadata);
 
 }

ReverseIndexFullConverter.java
@@ -7,6 +7,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalStatistics;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.rwf.RandomWriteFunnel;
 import nu.marginalia.array.IntArray;
@@ -179,21 +180,9 @@ public class ReverseIndexFullConverter {
         @SneakyThrows
         @Override
         public void accept(long docId, IndexJournalEntryData.Record record) {
-
-            /* Encode the ID as
-             *
-             *   32 bits   32 bits
-             * [ ranking | url-id ]
-             *
-             * in order to get low-ranking documents to be considered first
-             * when sorting the items.
-             */
-
-            int domainId = (int) (docId >>> 32);
-            long rankingId = (long) domainRankings.getRanking(domainId) << 32;
-
-            int urlId = (int) (docId & 0xFFFF_FFFFL);
-            long rankEncodedId = rankingId | urlId;
+            int domainId = UrlIdCodec.getDomainId(docId);
+            float rankingPart = domainRankings.getSortRanking(domainId);
+            long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
 
             final int wordId = record.wordId();
             long offset = startOfRange(wordId);
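
The removed comment described packing a 32-bit ranking next to a 32-bit url id; with 64-bit document ids that packing no longer fits, so the converter instead folds a normalized [0,1] rank into the top bits with UrlIdCodec.addRank. Because the rank occupies the most significant used bits, plain numeric order on the encoded ids still puts documents from important (low-rank) domains first, e.g.:

    // Same document, two hypothetical domain ranks: the lower (better) rank
    // yields the numerically smaller encoded id, so it sorts first.
    long better = UrlIdCodec.addRank(0.1f, UrlIdCodec.encodeId(5, 9));
    long worse  = UrlIdCodec.addRank(0.9f, UrlIdCodec.encodeId(5, 9));
    assert better < worse;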

ReverseIndexFullReader.java
@@ -111,10 +111,23 @@ public class ReverseIndexFullReader {
             return new long[docIds.length];
         }
 
-        Arrays.sort(docIds);
+        assert isSorted(docIds) : "The input array docIds is assumed to be sorted";
 
         var reader = createReaderNew(offset);
         return reader.queryData(docIds, 1);
     }
 
+    private boolean isSorted(long[] ids) {
+        if (ids.length == 0)
+            return true;
+        long prev = ids[0];
+
+        for (int i = 1; i < ids.length; i++) {
+            if(ids[i] <= prev)
+                return false;
+        }
+
+        return true;
+    }
+
 }
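
One thing worth flagging in the new isSorted helper: prev is initialized to ids[0] but never advanced, so every element is compared against the first one rather than its predecessor, and e.g. {1, 5, 3} would pass the check. A strictly ascending check would look like this (a suggested correction, not code from the commit):

    // Compares each element against its immediate predecessor.
    private boolean isSorted(long[] ids) {
        for (int i = 1; i < ids.length; i++) {
            if (ids[i] <= ids[i - 1])
                return false;
        }
        return true;
    }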

ReverseIndexPriorityConverter.java
@@ -10,6 +10,7 @@ import nu.marginalia.index.construction.IndexSizeEstimator;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalStatistics;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.rwf.RandomWriteFunnel;
 import nu.marginalia.service.control.ServiceHeartbeat;
@@ -178,21 +179,9 @@ public class ReverseIndexPriorityConverter {
         @SneakyThrows
         @Override
         public void accept(long docId, IndexJournalEntryData.Record record) {
-
-            /* Encode the ID as
-             *
-             *   32 bits   32 bits
-             * [ ranking | url-id ]
-             *
-             * in order to get low-ranking documents to be considered first
-             * when sorting the items.
-             */
-
-            int domainId = (int) (docId >>> 32);
-            long rankingId = (long) domainRankings.getRanking(domainId) << 32;
-
-            int urlId = (int) (docId & 0xFFFF_FFFFL);
-            long rankEncodedId = rankingId | urlId;
+            int domainId = UrlIdCodec.getDomainId(docId);
+            float rankingPart = domainRankings.getSortRanking(domainId);
+            long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
 
             final int wordId = record.wordId();
             long offset = startOfRange(wordId);

ReverseIndexFullConverterTest.java
@@ -9,6 +9,7 @@ import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
 import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@@ -113,17 +114,17 @@ class ReverseIndexFullConverterTest {
 
         var buffer = new LongQueryBuffer(32);
         reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer);
-        assertArrayEquals(LongStream.range(1, 17).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
+        assertArrayEquals(LongStream.range(1, 17).map(this::addMaxRank).toArray(), buffer.copyData());
         System.out.println(buffer);
 
         buffer.reset();
         reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer);
-        assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
+        assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(this::addMaxRank).toArray(), buffer.copyData());
         System.out.println(buffer);
 
         buffer.reset();
         reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer);
-        assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(v -> v | (255L << 32)).toArray(), buffer.copyData());
+        assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(this::addMaxRank).toArray(), buffer.copyData());
         System.out.println(buffer);
 
         buffer.reset();
@@ -137,4 +138,9 @@ class ReverseIndexFullConverterTest {
 
         TestUtil.clearTempDir(dataDir);
     }
+
+    // Add a max domain rank component to the input, when interpreted as an ID
+    private long addMaxRank(long in) {
+        return UrlIdCodec.addRank(1f, in);
+    }
 }

ReverseIndexFullConverterTest2.java
@@ -11,6 +11,7 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
@@ -101,8 +102,8 @@ class ReverseIndexFullConverterTest2 {
         return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
     }
 
-    long createId(long url, long domain) {
-        return (domain << 32) | url;
+    long createId(int url, int domain) {
+        return UrlIdCodec.encodeId(domain, url);
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);

ReverseIndexPriorityConverterTest2.java
@@ -13,6 +13,7 @@ import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
 import nu.marginalia.lexicon.KeywordLexicon;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
 import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.control.ServiceTaskHeartbeat;
@@ -101,8 +102,8 @@ class ReverseIndexPriorityConverterTest2 {
         return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
     }
 
-    long createId(long url, long domain) {
-        return (domain << 32) | url;
+    long createId(int url, int domain) {
+        return UrlIdCodec.encodeId(domain, url);
     }
     public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
         int[] factors = getFactorsI(id);

SearchIndex.java
@@ -188,7 +188,11 @@ public class SearchIndex {
                 indexReader.numHitsPrio(b)
         );
     }
-    /** Replaces the values of ids with their associated metadata, or 0L if absent */
+
+    /** Return an array of encoded document metadata longs corresponding to the
+     *  document identifiers provided; with metadata for termId. The input array
+     *  docs[] *must* be sorted.
+     */
     public long[] getTermMetadata(int termId, long[] docs) {
         return indexReader.getMetadata(termId, docs);
     }
@@ -200,10 +204,6 @@ public class SearchIndex {
         return indexReader.getHtmlFeatures(docId);
     }
 
-    public int getDomainId(long docId) {
-        return indexReader.getDomainId(docId);
-    }
-
     public int getTotalDocCount() {
         return indexReader.totalDocCount();
     }

SearchIndexReader.java
@@ -60,10 +60,6 @@ public class SearchIndexReader {
         return forwardIndexReader.getDocMeta(docId);
     }
 
-    public int getDomainId(long docId) {
-        return forwardIndexReader.getDomainId(docId);
-    }
-
     public int totalDocCount() {
         return forwardIndexReader.totalDocCount();
     }

IndexMetadataService.java
@@ -3,6 +3,7 @@ package nu.marginalia.index.results;
 import com.google.inject.Inject;
 import gnu.trove.map.hash.TObjectIntHashMap;
 import gnu.trove.set.hash.TLongHashSet;
+import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
 import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.index.client.model.query.SearchSubquery;
@@ -18,7 +19,6 @@ import java.util.OptionalInt;
 public class IndexMetadataService {
     private final SearchIndex index;
     private final SearchTermsService searchTermsService;
-
     private final ResultValuator searchResultValuator;
 
     @Inject
@@ -30,34 +30,16 @@ public class IndexMetadataService {
         this.searchResultValuator = searchResultValuator;
     }
 
-    public long getDocumentMetadata(long urlId) {
-        return index.getDocumentMetadata(urlId);
+    public long getDocumentMetadata(long docId) {
+        return index.getDocumentMetadata(docId);
     }
 
     public int getHtmlFeatures(long urlId) {
         return index.getHtmlFeatures(urlId);
     }
 
-    public int getDomainId(long urlId) {
-        return index.getDomainId(urlId);
-    }
-
-    public long[] getTermMetadata(int termId, long[] docIdsAll) {
-        return index.getTermMetadata(termId, docIdsAll);
-    }
-
-    public TermMetadata getTermMetadata(long[] docIdsAll, int[] termIdsList) {
-        var termdocToMeta = new Long2LongOpenHashMap(docIdsAll.length * termIdsList.length, 0.5f);
-
-        for (int term : termIdsList) {
-            var metadata = getTermMetadata(term, docIdsAll);
-
-            for (int i = 0; i < docIdsAll.length; i++) {
-                termdocToMeta.put(termdocKey(term, docIdsAll[i]), metadata[i]);
-            }
-        }
-
-        return new TermMetadata(termdocToMeta);
+    public TermMetadataForDocuments getTermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) {
+        return new TermMetadataForDocuments(docIdsAll, termIdsList);
     }
 
     public QuerySearchTerms getSearchTerms(List<SearchSubquery> searchTermVariants) {
@@ -80,7 +62,6 @@ public class IndexMetadataService {
             }
         }
 
-
         return new QuerySearchTerms(termToId,
                 termIdsList.toIntArray(),
                 getTermCoherences(searchTermVariants));
@@ -92,7 +73,10 @@ public class IndexMetadataService {
 
         for (var subquery : searchTermVariants) {
             for (var coh : subquery.searchTermCoherences) {
-                int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray();
+                int[] ids = coh.stream().map(searchTermsService::lookUpWord)
+                        .filter(OptionalInt::isPresent)
+                        .mapToInt(OptionalInt::getAsInt)
+                        .toArray();
                 coherences.add(ids);
             }
 
@@ -116,30 +100,43 @@ public class IndexMetadataService {
         var ret = new TLongHashSet(resultsArray.length);
 
         for (int priorityTerm : priorityTermIds) {
-            long[] metadata = getTermMetadata(priorityTerm, resultsArray);
+            long[] metadata = index.getTermMetadata(priorityTerm, resultsArray);
            for (int i = 0; i < metadata.length; i++) {
                 if (metadata[i] != 0) ret.add(resultsArray[i]);
             }
         }
 
         return ret;
-
 
     }
 
     public ResultValuator getSearchResultValuator() {
         return searchResultValuator;
     }
 
-    public static class TermMetadata {
-        private final Long2LongOpenHashMap termdocToMeta;
+    public class TermMetadataForDocuments {
+        private final Int2ObjectArrayMap<Long2LongOpenHashMap> termdocToMeta;
 
-        public TermMetadata(Long2LongOpenHashMap termdocToMeta) {
-            this.termdocToMeta = termdocToMeta;
+        public TermMetadataForDocuments(long[] docIdsAll, int[] termIdsList) {
+            termdocToMeta = new Int2ObjectArrayMap<>(termIdsList.length);
+            for (int termId : termIdsList) {
+                var mapForTerm = new Long2LongOpenHashMap(docIdsAll.length);
+
+                var metadata = index.getTermMetadata(termId, docIdsAll);
+                for (int i = 0; i < docIdsAll.length; i++) {
+                    mapForTerm.put(docIdsAll[i], metadata[i]);
+                }
+
+                termdocToMeta.put(termId, mapForTerm);
+            }
         }
 
         public long getTermMetadata(int termId, long docId) {
-            return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0);
+            var docsForTerm = termdocToMeta.get(termId);
+            if (docsForTerm == null) {
+                return 0;
+            }
+            return docsForTerm.getOrDefault(docId, 0);
         }
 
         public boolean testCoherence(long docId, TermCoherences coherences) {
@@ -164,20 +161,19 @@ public class IndexMetadataService {
 
         public final TermCoherences coherences;
 
-        public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll, TermCoherences coherences) {
+        public QuerySearchTerms(TObjectIntHashMap<String> termToId,
+                                int[] termIdsAll,
+                                TermCoherences coherences) {
             this.termToId = termToId;
             this.termIdsAll = termIdsAll;
             this.coherences = coherences;
         }
 
-        public int get(String searchTerm) {
+        public int getIdForTerm(String searchTerm) {
             return termToId.get(searchTerm);
         }
     }
 
+    /** wordIds that we require to be in the same sentence */
     public record TermCoherences(List<int[]> words) {}
-
-    private static long termdocKey(int termId, long docId) {
-        return (docId << 32) | Integer.toUnsignedLong(termId);
-    }
 }
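
The old TermMetadata keyed a single map by termdocKey, i.e. (docId << 32) | termId, which only works while document ids fit in 32 bits; with the new full-width ids that packing would collide, which is presumably why the lookup is restructured as one Long2LongOpenHashMap per term, keyed by the whole 64-bit document id. Usage sketch with the names from the diff (the input arrays are hypothetical):

    // sortedDocIds and termIds are hypothetical inputs; getTermMetadataForDocuments
    // eagerly fetches metadata for every (term, doc) pair up front.
    var termMeta = metadataService.getTermMetadataForDocuments(sortedDocIds, termIds);
    long meta = termMeta.getTermMetadata(termIds[0], sortedDocIds[0]); // 0 if absent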

IndexResultValuator.java
@@ -13,6 +13,7 @@ import nu.marginalia.index.client.model.query.SearchSubquery;
 import nu.marginalia.index.query.IndexQueryParams;
 import nu.marginalia.ranking.ResultValuator;
 
+import java.util.Arrays;
 import java.util.List;
 
 public class IndexResultValuator {
@@ -21,7 +22,7 @@ public class IndexResultValuator {
     private final IndexQueryParams queryParams;
     private final TLongHashSet resultsWithPriorityTerms;
 
-    private final IndexMetadataService.TermMetadata termMetadata;
+    private final IndexMetadataService.TermMetadataForDocuments termMetadataForDocuments;
     private final IndexMetadataService.QuerySearchTerms searchTerms;
 
     private final ResultRankingContext rankingContext;
@@ -36,16 +37,17 @@ public class IndexResultValuator {
         this.rankingContext = rankingContext;
         this.searchResultValuator = metadataService.getSearchResultValuator();
 
-        final long[] resultsArray = results.toArray();
+        final long[] ids = results.toArray();
+        Arrays.sort(ids);
 
         this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
         this.queryParams = queryParams;
         this.metadataService = metadataService;
 
         this.searchTerms = metadataService.getSearchTerms(subqueries);
-        this.termMetadata = metadataService.getTermMetadata(results.toArray(), searchTerms.termIdsAll);
+        this.termMetadataForDocuments = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
 
-        resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, resultsArray);
+        resultsWithPriorityTerms = metadataService.getResultsWithPriorityTerms(subqueries, ids);
     }
 
     private final long flagsFilterMask =
@@ -54,12 +56,10 @@ public class IndexResultValuator {
     public SearchResultItem calculatePreliminaryScore(long id) {
 
         SearchResultItem searchResult = new SearchResultItem(id);
-        final long urlIdInt = searchResult.getUrlIdInt();
+        final long docId = searchResult.getDocumentId();
 
-        searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
-
-        long docMetadata = metadataService.getDocumentMetadata(urlIdInt);
-        int htmlFeatures = metadataService.getHtmlFeatures(urlIdInt);
+        long docMetadata = metadataService.getDocumentMetadata(docId);
+        int htmlFeatures = metadataService.getHtmlFeatures(docId);
 
         int maxFlagsCount = 0;
         boolean anyAllSynthetic = false;
@@ -76,21 +76,21 @@ public class IndexResultValuator {
             for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
                 String searchTerm = termList.get(termIdx);
 
-                long metadata = termMetadata.getTermMetadata(
-                        searchTerms.get(searchTerm),
-                        searchResult.getUrlIdInt()
+                long termMetadata = termMetadataForDocuments.getTermMetadata(
+                        searchTerms.getIdForTerm(searchTerm),
+                        searchResult.combinedId
                 );
 
                 var score = new SearchResultKeywordScore(
                         querySetId,
                         searchTerm,
-                        metadata,
+                        termMetadata,
                         docMetadata,
                         htmlFeatures,
                         resultsWithPriorityTerms.contains(searchResult.combinedId)
                 );
 
-                synthetic &= WordFlags.Synthetic.isPresent(metadata);
+                synthetic &= WordFlags.Synthetic.isPresent(termMetadata);
 
                 searchResult.keywordScores.add(score);
 
@@ -117,11 +117,13 @@ public class IndexResultValuator {
 
         final boolean hasPriorityTerm = resultsWithPriorityTerms.contains(id);
 
-        double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext);
+        double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores,
+                5000,
+                rankingContext);
 
         boolean disqualified = false;
 
-        if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences))
+        if (!termMetadataForDocuments.testCoherence(docId, searchTerms.coherences))
             disqualified = true;
         else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
             disqualified = true;

IndexQueryService.java
@@ -266,9 +266,7 @@ public class IndexQueryService {
 
         var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
 
-        results.sort(Comparator.comparing(SearchResultItem::getScore).reversed()
-                .thenComparingInt(SearchResultItem::getRanking)
-                .thenComparingInt(SearchResultItem::getUrlIdInt));
+        results.sort(Comparator.naturalOrder());
 
         List<SearchResultItem> resultsList = new ArrayList<>(results.size());
 

RankingSearchSet.java
@@ -63,12 +63,13 @@ public class RankingSearchSet implements SearchSet {
     }
 
     @Override
-    public boolean contains(int urlId, long documentMetadata) {
+    public boolean contains(int domainId, long documentMetadata) {
+
         // This is the main check
-        if (set.contains(urlId) || set.isEmpty()) {
+        if (set.contains(domainId) || set.isEmpty()) {
             return true;
         }
 
         // TODO
         return false;
     }

SearchSetAny.java
@@ -4,7 +4,7 @@ import nu.marginalia.index.searchset.SearchSet;
 
 public class SearchSetAny implements SearchSet {
     @Override
-    public boolean contains(int urlId, long meta) {
+    public boolean contains(int domainId, long meta) {
         return true;
     }
 

IndexQueryServiceIntegrationTest.java
@@ -15,6 +15,7 @@ import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.lexicon.KeywordLexicon;
+import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.model.idx.WordMetadata;
@@ -96,12 +97,14 @@ public class IndexQueryServiceIntegrationTest {
                         List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
                         Collections.emptyList()))).build());
 
-        Assertions.assertArrayEquals(
-                new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
-                rsp.results
+        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
+        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
+        long[] actual = rsp.results
                 .stream()
-                .mapToInt(SearchResultItem::getUrlIdInt)
-                .toArray());
+                .mapToLong(SearchResultItem::getDocumentId)
+                .toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
     }
 
 
@@ -127,9 +130,11 @@ public class IndexQueryServiceIntegrationTest {
                 .subqueries(List.of(new SearchSubquery(
                         List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
                         Collections.emptyList()))).build());
-        Assertions.assertArrayEquals(
-                new int[] { 210, 270 },
-                rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
+        int[] idxes = new int[] { 210, 270 };
+        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
+        long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
     }
 
     @Test
@@ -169,13 +174,17 @@ public class IndexQueryServiceIntegrationTest {
 
     }
 
+    private long fullId(int id) {
+        return UrlIdCodec.encodeId((32 - (id % 32)), id);
+    }
+
     public void loadData(int id) {
         int[] factors = IntStream
                 .rangeClosed(1, id)
                 .filter(v -> (id % v) == 0)
                 .toArray();
 
-        long fullId = id | ((long) (32 - (id % 32)) << 32);
+        long fullId = fullId(id);
 
         var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
 
@@ -190,7 +199,7 @@ public class IndexQueryServiceIntegrationTest {
 
     public void loadDataWithDomain(int domain, int id) {
         int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, 0, id | ((long) domain << 32), DocumentMetadata.defaultValue());
+        var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
 
         long[] data = new long[factors.length*2];
         for (int i = 0; i < factors.length; i++) {