(index) Fix priority search terms

This functionality fell into disrepair some time ago. It is supposed to allow non-mandatory search terms that boost a document's ranking when they are present in the document.
This commit is contained in:
Viktor Lofgren 2024-04-15 16:44:08 +02:00
parent b6d365bacd
commit 599e719ad4
9 changed files with 68 additions and 6 deletions

View File

@ -138,6 +138,7 @@ public class QueryProtobufCodec {
rawItem.getHtmlFeatures(),
keywordScores,
rawItem.getResultsFromDomain(),
rawItem.getHasPriorityTerms(),
Double.NaN // Not set
);
}

View File

@ -28,11 +28,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) {
public boolean hasPrioTerm;
/**
 * Creates a result item with an empty keyword score list.
 *
 * @param combinedId         value stored in {@code combinedId}
 * @param encodedDocMetadata value stored in {@code encodedDocMetadata}
 * @param htmlFeatures       value stored in {@code htmlFeatures}
 * @param hasPrioTerm        value stored in {@code hasPrioTerm}; presumably
 *                           whether the document matched a priority search
 *                           term (confirm against the caller)
 */
public SearchResultItem(long combinedId,
                        long encodedDocMetadata,
                        int htmlFeatures,
                        boolean hasPrioTerm) {
    // Values handed in by the caller
    this.combinedId = combinedId;
    this.encodedDocMetadata = encodedDocMetadata;
    this.htmlFeatures = htmlFeatures;
    this.hasPrioTerm = hasPrioTerm;

    // Keyword scores start out empty
    this.keywordScores = new ArrayList<>();
}
@ -85,4 +91,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
return Long.compare(this.combinedId, o.combinedId);
}
}

View File

@ -101,13 +101,13 @@ message RpcRawResultItem {
int64 encodedDocMetadata = 3; // bit encoded document metadata
int32 htmlFeatures = 4; // bitmask encoding features of the document
repeated RpcResultKeywordScore keywordScores = 5;
bool hasPriorityTerms = 6; // true if the document matched at least one of the query's priority terms
}
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata
bool hasPriorityTerms = 3; // true if this word is important to the document
}
/* Query execution parameters */

View File

@ -138,6 +138,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
for (var score : rawResult.keywordScores) {
rawItem.addKeywordScores(

View File

@ -43,6 +43,7 @@ public class IndexMetadataService {
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
LongArrayList termIdsList = new LongArrayList();
LongArrayList termIdsPrio = new LongArrayList();
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
@ -52,8 +53,30 @@ public class IndexMetadataService {
termToId.put(word, id);
}
for (var term : searchQuery.searchTermsAdvice) {
if (termToId.containsKey(term)) {
continue;
}
long id = SearchTermsUtil.getWordId(term);
termIdsList.add(id);
termToId.put(term, id);
}
for (var term : searchQuery.searchTermsPriority) {
if (termToId.containsKey(term)) {
continue;
}
long id = SearchTermsUtil.getWordId(term);
termIdsList.add(id);
termIdsPrio.add(id);
termToId.put(term, id);
}
return new QuerySearchTerms(termToId,
new TermIdList(termIdsList),
new TermIdList(termIdsPrio),
new TermCoherenceGroupList(
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
)

View File

@ -52,7 +52,8 @@ public class IndexResultValuationContext {
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
searchTerms.termIdsAll);
}
private final long flagsFilterMask =
@ -69,7 +70,10 @@ public class IndexResultValuationContext {
long docMetadata = statefulIndex.getDocumentMetadata(docId);
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures);
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,
htmlFeatures,
hasPrioTerm(combinedId));
long[] wordMetas = new long[compiledQuery.size()];
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
@ -108,11 +112,24 @@ public class IndexResultValuationContext {
5000, // use a dummy value here as it's not present in the index
rankingContext);
if (searchResult.hasPrioTerm) {
score = 0.75 * score;
}
searchResult.setScore(score);
return searchResult;
}
/**
 * Checks whether any of the query's priority terms has term metadata
 * recorded for the given document.
 *
 * @param combinedId the combined document id to look up
 * @return {@code true} as soon as one priority term reports metadata for
 *         the document, {@code false} if none does (including when there
 *         are no priority terms)
 */
private boolean hasPrioTerm(long combinedId) {
    boolean matched = false;

    for (var prioTermId : searchTerms.termIdsPrio.array()) {
        matched = termMetadataForCombinedDocumentIds.hasTermMeta(prioTermId, combinedId);
        if (matched) {
            // One hit is enough; no need to probe the remaining terms
            break;
        }
    }

    return matched;
}
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
QueryStrategy queryStrategy)
{

View File

@ -6,14 +6,17 @@ import nu.marginalia.index.results.model.ids.TermIdList;
public class QuerySearchTerms {
private final TObjectLongHashMap<String> termToId;
public final TermIdList termIdsAll;
public final TermIdList termIdsPrio;
public final TermCoherenceGroupList coherences;
/**
 * Bundles the term information derived from a search query.
 *
 * @param termToId    mapping from term string to term id
 * @param termIdsAll  stored in {@code termIdsAll}
 * @param termIdsPrio stored in {@code termIdsPrio}
 * @param coherences  stored in {@code coherences}
 */
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
                        TermIdList termIdsAll,
                        TermIdList termIdsPrio,
                        TermCoherenceGroupList coherences) {
    this.coherences = coherences;
    this.termIdsPrio = termIdsPrio;
    this.termIdsAll = termIdsAll;
    this.termToId = termToId;
}

View File

@ -18,12 +18,21 @@ public class TermMetadataForCombinedDocumentIds {
/**
 * Looks up the metadata stored for a term in a specific document.
 *
 * @param termId     id of the term
 * @param combinedId combined id of the document
 * @return the value found for the document under the term's entry, or 0
 *         (after logging a warning) when the term has no entry at all
 */
public long getTermMetadata(long termId, long combinedId) {
    var docsForTerm = termdocToMeta.get(termId);

    if (docsForTerm != null) {
        return docsForTerm.get(combinedId);
    }

    // No entry for this term
    logger.warn("Missing meta for term {}", termId);
    return 0;
}
/**
 * Tells whether non-zero metadata is stored for a term in a specific document.
 *
 * @param termId     id of the term
 * @param combinedId combined id of the document
 * @return {@code true} when the term has an entry and the value found for
 *         the document is not 0; {@code false} otherwise
 */
public boolean hasTermMeta(long termId, long combinedId) {
    var docsForTerm = termdocToMeta.get(termId);

    // A missing term entry and a zero metadata value both count as "no metadata"
    return docsForTerm != null && docsForTerm.get(combinedId) != 0;
}
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));

View File

@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}
SearchResultItem forId(int domain, int ordinal) {
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN);
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false);
}
}