mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Fix priority search terms
This functionality fell into disrepair some while ago. It's supposed to allow non-mandatory search terms that boost the ranking if they are present in the document.
This commit is contained in:
parent
6efc0f21fe
commit
155be1078d
@ -138,6 +138,7 @@ public class QueryProtobufCodec {
|
||||
rawItem.getHtmlFeatures(),
|
||||
keywordScores,
|
||||
rawItem.getResultsFromDomain(),
|
||||
rawItem.getHasPriorityTerms(),
|
||||
Double.NaN // Not set
|
||||
);
|
||||
}
|
||||
|
@ -28,11 +28,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
/** How many other potential results existed in the same domain */
|
||||
public int resultsFromDomain;
|
||||
|
||||
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) {
|
||||
public boolean hasPrioTerm;
|
||||
|
||||
/**
 * Creates a result item with an empty keyword score list.
 *
 * @param combinedId         stored as-is in {@code this.combinedId}
 * @param encodedDocMetadata stored as-is in {@code this.encodedDocMetadata}
 * @param htmlFeatures       stored as-is in {@code this.htmlFeatures}
 * @param hasPrioTerm        stored as-is in {@code this.hasPrioTerm}
 */
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures, boolean hasPrioTerm) {
    // Scores are not known at construction time; start with an empty, growable list.
    this.keywordScores = new ArrayList<>();

    this.combinedId = combinedId;
    this.encodedDocMetadata = encodedDocMetadata;
    this.htmlFeatures = htmlFeatures;
    this.hasPrioTerm = hasPrioTerm;
}
|
||||
|
||||
|
||||
@ -85,4 +91,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
return Long.compare(this.combinedId, o.combinedId);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -101,13 +101,13 @@ message RpcRawResultItem {
|
||||
int64 encodedDocMetadata = 3; // bit encoded document metadata
|
||||
int32 htmlFeatures = 4; // bitmask encoding features of the document
|
||||
repeated RpcResultKeywordScore keywordScores = 5;
|
||||
bool hasPriorityTerms = 6; // true if the document matched at least one of the query's priority terms
|
||||
}
|
||||
|
||||
/* Information about how well a keyword matches a query */
|
||||
message RpcResultKeywordScore {
|
||||
string keyword = 1; // the keyword
|
||||
int64 encodedWordMetadata = 2; // bit encoded word metadata
|
||||
bool hasPriorityTerms = 3; // true if this word is important to the document
|
||||
}
|
||||
|
||||
/* Query execution parameters */
|
||||
|
@ -138,6 +138,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
|
||||
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
|
||||
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
|
||||
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
|
||||
|
||||
for (var score : rawResult.keywordScores) {
|
||||
rawItem.addKeywordScores(
|
||||
|
@ -43,6 +43,7 @@ public class IndexMetadataService {
|
||||
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
|
||||
|
||||
LongArrayList termIdsList = new LongArrayList();
|
||||
LongArrayList termIdsPrio = new LongArrayList();
|
||||
|
||||
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
||||
|
||||
@ -52,8 +53,30 @@ public class IndexMetadataService {
|
||||
termToId.put(word, id);
|
||||
}
|
||||
|
||||
for (var term : searchQuery.searchTermsAdvice) {
|
||||
if (termToId.containsKey(term)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
long id = SearchTermsUtil.getWordId(term);
|
||||
termIdsList.add(id);
|
||||
termToId.put(term, id);
|
||||
}
|
||||
|
||||
for (var term : searchQuery.searchTermsPriority) {
|
||||
if (termToId.containsKey(term)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
long id = SearchTermsUtil.getWordId(term);
|
||||
termIdsList.add(id);
|
||||
termIdsPrio.add(id);
|
||||
termToId.put(term, id);
|
||||
}
|
||||
|
||||
return new QuerySearchTerms(termToId,
|
||||
new TermIdList(termIdsList),
|
||||
new TermIdList(termIdsPrio),
|
||||
new TermCoherenceGroupList(
|
||||
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
|
||||
)
|
||||
|
@ -52,7 +52,8 @@ public class IndexResultValuationContext {
|
||||
|
||||
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||
|
||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
|
||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
|
||||
searchTerms.termIdsAll);
|
||||
}
|
||||
|
||||
private final long flagsFilterMask =
|
||||
@ -69,7 +70,10 @@ public class IndexResultValuationContext {
|
||||
long docMetadata = statefulIndex.getDocumentMetadata(docId);
|
||||
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
|
||||
|
||||
SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures);
|
||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||
docMetadata,
|
||||
htmlFeatures,
|
||||
hasPrioTerm(combinedId));
|
||||
|
||||
long[] wordMetas = new long[compiledQuery.size()];
|
||||
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
|
||||
@ -108,11 +112,24 @@ public class IndexResultValuationContext {
|
||||
5000, // use a dummy value here as it's not present in the index
|
||||
rankingContext);
|
||||
|
||||
if (searchResult.hasPrioTerm) {
|
||||
score = 0.75 * score;
|
||||
}
|
||||
|
||||
searchResult.setScore(score);
|
||||
|
||||
return searchResult;
|
||||
}
|
||||
|
||||
private boolean hasPrioTerm(long combinedId) {
|
||||
for (var term : searchTerms.termIdsPrio.array()) {
|
||||
if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
||||
QueryStrategy queryStrategy)
|
||||
{
|
||||
|
@ -6,14 +6,17 @@ import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
public class QuerySearchTerms {
|
||||
private final TObjectLongHashMap<String> termToId;
|
||||
public final TermIdList termIdsAll;
|
||||
public final TermIdList termIdsPrio;
|
||||
|
||||
public final TermCoherenceGroupList coherences;
|
||||
|
||||
/**
 * Bundles the term lookups derived from a search query.
 *
 * @param termToId    mapping from term string to term id
 * @param termIdsAll  stored in {@link #termIdsAll}
 * @param termIdsPrio stored in {@link #termIdsPrio}
 * @param coherences  stored in {@link #coherences}
 */
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
                        TermIdList termIdsAll,
                        TermIdList termIdsPrio,
                        TermCoherenceGroupList coherences)
{
    // Publicly readable id lists
    this.termIdsAll = termIdsAll;
    this.termIdsPrio = termIdsPrio;

    this.coherences = coherences;

    // Kept private
    this.termToId = termToId;
}
|
||||
|
||||
|
@ -18,12 +18,21 @@ public class TermMetadataForCombinedDocumentIds {
|
||||
public long getTermMetadata(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
if (metaByCombinedId == null) {
|
||||
logger.warn("Missing meta for term {}", termId);
|
||||
return 0;
|
||||
}
|
||||
return metaByCombinedId.get(combinedId);
|
||||
}
|
||||
|
||||
public boolean hasTermMeta(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
|
||||
if (metaByCombinedId == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return metaByCombinedId.get(combinedId) != 0;
|
||||
}
|
||||
|
||||
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
|
||||
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
|
||||
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
|
||||
|
@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
|
||||
}
|
||||
|
||||
SearchResultItem forId(int domain, int ordinal) {
|
||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN);
|
||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user