mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Fix priority search terms
This functionality fell into disrepair some time ago. It is supposed to allow non-mandatory search terms that boost a document's ranking when they are present in the document.
This commit is contained in:
parent
b6d365bacd
commit
599e719ad4
@ -138,6 +138,7 @@ public class QueryProtobufCodec {
|
|||||||
rawItem.getHtmlFeatures(),
|
rawItem.getHtmlFeatures(),
|
||||||
keywordScores,
|
keywordScores,
|
||||||
rawItem.getResultsFromDomain(),
|
rawItem.getResultsFromDomain(),
|
||||||
|
rawItem.getHasPriorityTerms(),
|
||||||
Double.NaN // Not set
|
Double.NaN // Not set
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -28,11 +28,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
/** How many other potential results existed in the same domain */
|
/** How many other potential results existed in the same domain */
|
||||||
public int resultsFromDomain;
|
public int resultsFromDomain;
|
||||||
|
|
||||||
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) {
|
public boolean hasPrioTerm;
|
||||||
|
|
||||||
|
public SearchResultItem(long combinedId,
|
||||||
|
long encodedDocMetadata,
|
||||||
|
int htmlFeatures,
|
||||||
|
boolean hasPrioTerm) {
|
||||||
this.combinedId = combinedId;
|
this.combinedId = combinedId;
|
||||||
this.encodedDocMetadata = encodedDocMetadata;
|
this.encodedDocMetadata = encodedDocMetadata;
|
||||||
this.keywordScores = new ArrayList<>();
|
this.keywordScores = new ArrayList<>();
|
||||||
this.htmlFeatures = htmlFeatures;
|
this.htmlFeatures = htmlFeatures;
|
||||||
|
this.hasPrioTerm = hasPrioTerm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -85,4 +91,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
|
|
||||||
return Long.compare(this.combinedId, o.combinedId);
|
return Long.compare(this.combinedId, o.combinedId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -101,13 +101,13 @@ message RpcRawResultItem {
|
|||||||
int64 encodedDocMetadata = 3; // bit encoded document metadata
|
int64 encodedDocMetadata = 3; // bit encoded document metadata
|
||||||
int32 htmlFeatures = 4; // bitmask encoding features of the document
|
int32 htmlFeatures = 4; // bitmask encoding features of the document
|
||||||
repeated RpcResultKeywordScore keywordScores = 5;
|
repeated RpcResultKeywordScore keywordScores = 5;
|
||||||
|
bool hasPriorityTerms = 6; // true if the document matches at least one of the query's priority terms
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Information about how well a keyword matches a query */
|
/* Information about how well a keyword matches a query */
|
||||||
message RpcResultKeywordScore {
|
message RpcResultKeywordScore {
|
||||||
string keyword = 1; // the keyword
|
string keyword = 1; // the keyword
|
||||||
int64 encodedWordMetadata = 2; // bit encoded word metadata
|
int64 encodedWordMetadata = 2; // bit encoded word metadata
|
||||||
bool hasPriorityTerms = 3; // true if this word is important to the document
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Query execution parameters */
|
/* Query execution parameters */
|
||||||
|
@ -138,6 +138,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
|
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
|
||||||
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
|
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
|
||||||
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
|
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
|
||||||
|
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
|
||||||
|
|
||||||
for (var score : rawResult.keywordScores) {
|
for (var score : rawResult.keywordScores) {
|
||||||
rawItem.addKeywordScores(
|
rawItem.addKeywordScores(
|
||||||
|
@ -43,6 +43,7 @@ public class IndexMetadataService {
|
|||||||
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
|
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
|
||||||
|
|
||||||
LongArrayList termIdsList = new LongArrayList();
|
LongArrayList termIdsList = new LongArrayList();
|
||||||
|
LongArrayList termIdsPrio = new LongArrayList();
|
||||||
|
|
||||||
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
|
||||||
|
|
||||||
@ -52,8 +53,30 @@ public class IndexMetadataService {
|
|||||||
termToId.put(word, id);
|
termToId.put(word, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (var term : searchQuery.searchTermsAdvice) {
|
||||||
|
if (termToId.containsKey(term)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
long id = SearchTermsUtil.getWordId(term);
|
||||||
|
termIdsList.add(id);
|
||||||
|
termToId.put(term, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var term : searchQuery.searchTermsPriority) {
|
||||||
|
if (termToId.containsKey(term)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
long id = SearchTermsUtil.getWordId(term);
|
||||||
|
termIdsList.add(id);
|
||||||
|
termIdsPrio.add(id);
|
||||||
|
termToId.put(term, id);
|
||||||
|
}
|
||||||
|
|
||||||
return new QuerySearchTerms(termToId,
|
return new QuerySearchTerms(termToId,
|
||||||
new TermIdList(termIdsList),
|
new TermIdList(termIdsList),
|
||||||
|
new TermIdList(termIdsPrio),
|
||||||
new TermCoherenceGroupList(
|
new TermCoherenceGroupList(
|
||||||
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
|
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
|
||||||
)
|
)
|
||||||
|
@ -52,7 +52,8 @@ public class IndexResultValuationContext {
|
|||||||
|
|
||||||
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||||
|
|
||||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll);
|
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
|
||||||
|
searchTerms.termIdsAll);
|
||||||
}
|
}
|
||||||
|
|
||||||
private final long flagsFilterMask =
|
private final long flagsFilterMask =
|
||||||
@ -69,7 +70,10 @@ public class IndexResultValuationContext {
|
|||||||
long docMetadata = statefulIndex.getDocumentMetadata(docId);
|
long docMetadata = statefulIndex.getDocumentMetadata(docId);
|
||||||
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
|
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
|
||||||
|
|
||||||
SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures);
|
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||||
|
docMetadata,
|
||||||
|
htmlFeatures,
|
||||||
|
hasPrioTerm(combinedId));
|
||||||
|
|
||||||
long[] wordMetas = new long[compiledQuery.size()];
|
long[] wordMetas = new long[compiledQuery.size()];
|
||||||
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
|
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
|
||||||
@ -108,11 +112,24 @@ public class IndexResultValuationContext {
|
|||||||
5000, // use a dummy value here as it's not present in the index
|
5000, // use a dummy value here as it's not present in the index
|
||||||
rankingContext);
|
rankingContext);
|
||||||
|
|
||||||
|
if (searchResult.hasPrioTerm) {
|
||||||
|
score = 0.75 * score;
|
||||||
|
}
|
||||||
|
|
||||||
searchResult.setScore(score);
|
searchResult.setScore(score);
|
||||||
|
|
||||||
return searchResult;
|
return searchResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasPrioTerm(long combinedId) {
|
||||||
|
for (var term : searchTerms.termIdsPrio.array()) {
|
||||||
|
if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
||||||
QueryStrategy queryStrategy)
|
QueryStrategy queryStrategy)
|
||||||
{
|
{
|
||||||
|
@ -6,14 +6,17 @@ import nu.marginalia.index.results.model.ids.TermIdList;
|
|||||||
public class QuerySearchTerms {
|
public class QuerySearchTerms {
|
||||||
private final TObjectLongHashMap<String> termToId;
|
private final TObjectLongHashMap<String> termToId;
|
||||||
public final TermIdList termIdsAll;
|
public final TermIdList termIdsAll;
|
||||||
|
public final TermIdList termIdsPrio;
|
||||||
|
|
||||||
public final TermCoherenceGroupList coherences;
|
public final TermCoherenceGroupList coherences;
|
||||||
|
|
||||||
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
|
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
|
||||||
TermIdList termIdsAll,
|
TermIdList termIdsAll,
|
||||||
|
TermIdList termIdsPrio,
|
||||||
TermCoherenceGroupList coherences) {
|
TermCoherenceGroupList coherences) {
|
||||||
this.termToId = termToId;
|
this.termToId = termToId;
|
||||||
this.termIdsAll = termIdsAll;
|
this.termIdsAll = termIdsAll;
|
||||||
|
this.termIdsPrio = termIdsPrio;
|
||||||
this.coherences = coherences;
|
this.coherences = coherences;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,12 +18,21 @@ public class TermMetadataForCombinedDocumentIds {
|
|||||||
public long getTermMetadata(long termId, long combinedId) {
|
public long getTermMetadata(long termId, long combinedId) {
|
||||||
var metaByCombinedId = termdocToMeta.get(termId);
|
var metaByCombinedId = termdocToMeta.get(termId);
|
||||||
if (metaByCombinedId == null) {
|
if (metaByCombinedId == null) {
|
||||||
logger.warn("Missing meta for term {}", termId);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return metaByCombinedId.get(combinedId);
|
return metaByCombinedId.get(combinedId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hasTermMeta(long termId, long combinedId) {
|
||||||
|
var metaByCombinedId = termdocToMeta.get(termId);
|
||||||
|
|
||||||
|
if (metaByCombinedId == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return metaByCombinedId.get(combinedId) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
|
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
|
||||||
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
|
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
|
||||||
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
|
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
|
||||||
|
@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SearchResultItem forId(int domain, int ordinal) {
|
SearchResultItem forId(int domain, int ordinal) {
|
||||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN);
|
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user