MarginaliaSearch/code/functions/search-query/api/src/main/protobuf/query-api.proto
Viktor Lofgren a84a06975c (ranking-params) Add disable penalties flag to ranking params
This will help debugging ranking issues.  Later it may be added to some filters.
2025-01-08 00:16:49 +01:00

218 lines
5.8 KiB
Protocol Buffer

// This schema uses proto3 syntax.
syntax = "proto3";

package nu.marginalia.api.searchquery;

// Java code is generated into this package, with each top-level definition
// emitted into its own .java file rather than nested in one outer class.
option java_package = "nu.marginalia.api.searchquery";
option java_multiple_files = true;
/** Query service endpoint */
service QueryApi {
  // Unary call: one RpcQsQuery in, one RpcQsResponse out.
  rpc query(RpcQsQuery) returns (RpcQsResponse);
}
/** Index service endpoint */
service IndexApi {
  // Server-streaming call: one RpcIndexQuery in, a stream of
  // RpcDecoratedResultItem messages out.
  rpc query(RpcIndexQuery) returns (stream RpcDecoratedResultItem);
}
/** A message that carries no fields */
message Empty {
}
/* Query Service query request */
message RpcQsQuery {
  // Field number 11 is not assigned to any field in this message. Reserving
  // it makes protoc reject any later attempt to (re)use the number, which
  // protects against misreading data written with an older schema.
  reserved 11;

  // NOTE(review): presumably the query as the user typed it, matching the
  // documented meaning of RpcIndexQuery.humanQuery -- confirm.
  string humanQuery = 1;
  string nearDomain = 2;

  // Four lists of terms supplied alongside the human query.
  // NOTE(review): the names mirror the include/exclude/priority/advice lists
  // of RpcQuery -- confirm that they are mapped onto those.
  repeated string tacitIncludes = 3;
  repeated string tacitExcludes = 4;
  repeated string tacitPriority = 5;
  repeated string tacitAdvice = 6;

  // Limits on individual fields, each encoded as an RpcSpecLimit.
  RpcSpecLimit quality = 7;
  RpcSpecLimit year = 8;
  RpcSpecLimit size = 9;
  RpcSpecLimit rank = 10;

  repeated int32 domainIds = 12;

  // Query execution parameters.
  RpcQueryLimits queryLimits = 13;

  string searchSetIdentifier = 14;

  // Named query configuration.
  string queryStrategy = 15;

  RpcTemporalBias temporalBias = 16;
  RpcQsQueryPagination pagination = 17;
}
/** Response returned by the query service for a query */
message RpcQsResponse {
  // An index service query request message.
  RpcIndexQuery specs = 1;

  // Result items decorated with title and description metadata.
  repeated RpcDecoratedResultItem results = 2;

  repeated string searchTermsHuman = 3;
  repeated string problems = 4;
  string domain = 5;

  // Paging information, including the total result count.
  RpcQsResultPagination pagination = 6;
}
/** Paging fields carried by RpcQsQuery */
message RpcQsQueryPagination {
  // NOTE(review): whether page numbering starts at 0 or 1 is not visible
  // here -- confirm against the caller.
  int32 page = 1;
  int32 pageSize = 2;
}
/** Paging fields carried by RpcQsResponse */
message RpcQsResultPagination {
  // Same two fields as RpcQsQueryPagination ...
  int32 page = 1;
  int32 pageSize = 2;

  // ... plus a total result count.
  int32 totalResults = 3;
}
/** Wraps a single temporal bias selection */
message RpcTemporalBias {
  // Available bias settings. NONE is the zero value, so it is what an
  // unset `bias` field reads as in proto3.
  enum Bias {
    NONE = 0;
    RECENT = 1;
    OLD = 2;
  }

  Bias bias = 1;
}
/* Index service query request */
message RpcIndexQuery {
  // Field number 9 is not assigned to any field in this message. Reserving
  // it makes protoc reject any later attempt to (re)use the number, which
  // protects against misreading data written with an older schema.
  reserved 9;

  RpcQuery query = 1;

  // (optional) A list of domain IDs to consider
  repeated int32 domains = 2;

  // (optional) A named set of domains to consider
  string searchSetIdentifier = 3;

  // The search query as the user entered it
  string humanQuery = 4;

  // Limits on individual fields, each encoded as an RpcSpecLimit.
  RpcSpecLimit quality = 5;
  RpcSpecLimit year = 6;
  RpcSpecLimit size = 7;
  RpcSpecLimit rank = 8;

  // Query execution parameters.
  RpcQueryLimits queryLimits = 10;

  // Named query configuration
  string queryStrategy = 11;

  // Parameters for the result ranking function.
  RpcResultRankingParameters parameters = 12;
}
/** Tagged union that encodes some limit on a field */
message RpcSpecLimit {
  // The tag: how `value` is to be compared. NONE is the zero value and
  // therefore the proto3 default when `type` is left unset.
  enum TYPE {
    NONE = 0;
    EQUALS = 1;
    LESS_THAN = 2;
    GREATER_THAN = 3;
  }

  // The operand of the limit.
  int32 value = 1;

  // Which kind of limit this is.
  TYPE type = 2;
}
/**
 * A search result item, decorated with title and description metadata
 * taken from the link database.
 */
message RpcDecoratedResultItem {
  // The raw index-service view of this result.
  RpcRawResultItem rawItem = 1;

  string url = 2;
  string title = 3;
  string description = 4;
  double urlQuality = 5;
  string format = 6;

  // Bitmask encoding features of the document.
  int32 features = 7;

  int32 pubYear = 8;
  int64 dataHash = 9;
  int32 wordsTotal = 10;

  // The ranking score of this search result item; lower is better.
  double rankingScore = 11;

  int64 bestPositions = 12;

  // Optional; only present if exportDebugData is true in
  // RpcResultRankingParameters.
  RpcResultRankingDetails rankingDetails = 13;

  int32 resultsFromDomain = 14;
}
/** A raw index-service view of a search result */
message RpcRawResultItem {
  // Field number 2 is not assigned to any field in this message. Reserving
  // it makes protoc reject any later attempt to (re)use the number, which
  // protects against misreading data written with an older schema.
  reserved 2;

  // The kinds of match a result can represent. FLAGS is the zero value and
  // therefore the proto3 default when `matchType` is left unset.
  enum MATCH_TYPE {
    FLAGS = 0;
    PROXIMITY = 1;
    PHRASE = 2;
  }

  // Raw ID with bit-encoded ranking information still present.
  int64 combinedId = 1;

  // Bit-encoded document metadata.
  int64 encodedDocMetadata = 3;

  // Bitmask encoding features of the document.
  int32 htmlFeatures = 4;

  // Information about how well each keyword matches the query.
  repeated RpcResultKeywordScore keywordScores = 5;

  // true if this word is important to the document
  // NOTE(review): the wording above says "this word" although the field
  // sits on a result item, not a keyword -- confirm the intended meaning.
  bool hasPriorityTerms = 6;

  // The type of match this result represents.
  MATCH_TYPE matchType = 7;
}
/** Describes how well one keyword matches a query */
message RpcResultKeywordScore {
  // The keyword itself.
  string keyword = 1;

  // NOTE(review): the encodings of `flags` and `positions` are not visible
  // in this file -- confirm against the producer before decoding them.
  int32 flags = 2;
  int32 positions = 3;
}
/** Parameters governing query execution */
message RpcQueryLimits {
  int32 resultsByDomain = 1;
  int32 resultsTotal = 2;

  // NOTE(review): the "Ms" suffix suggests milliseconds -- confirm.
  int32 timeoutMs = 3;

  // Size of the fetch buffer in the index service.
  int32 fetchSize = 4;
}
/** Parameters for the result ranking function */
message RpcResultRankingParameters {
  // Field numbers 3, 4 and 12 are not assigned to any field in this message
  // (12 was previously marked "unused" in a comment only). Reserving them
  // makes protoc reject any later attempt to (re)use the numbers, which
  // protects against misreading data written with an older schema.
  reserved 3, 4, 12;

  // BM25 parameters.
  double bm25K = 1;
  double bm25B = 2;

  int32 shortDocumentThreshold = 5;
  double shortDocumentPenalty = 6;
  double domainRankBonus = 7;
  double qualityPenalty = 8;
  int32 shortSentenceThreshold = 9;
  double shortSentencePenalty = 10;
  double bm25Weight = 11;

  double tcfFirstPositionWeight = 13;
  double tcfVerbatimWeight = 14;
  double tcfProximityWeight = 15;

  RpcTemporalBias temporalBias = 16;
  double temporalBiasWeight = 17;

  // RpcDecoratedResultItem.rankingDetails is documented as only being
  // present when this flag is true.
  bool exportDebugData = 18;

  // Flag for disabling penalties.
  // NOTE(review): which of the penalties above it covers is not visible in
  // this file -- confirm against the ranking code.
  bool disablePenalties = 19;
}
/** Groups the document-level and term-level ranking outputs */
message RpcResultRankingDetails {
  // Document-level outputs.
  RpcResultDocumentRankingOutputs documentOutputs = 1;

  // Term-level outputs.
  RpcResultTermRankingOutputs termOutputs = 2;
}
/**
 * A set of integer values plus a list of flag strings.
 * NOTE(review): going by the name these are inputs to the ranking function;
 * no message in the visible part of this file references this type -- confirm.
 */
message RpcResultRankingInputs {
  int32 rank = 1;
  int32 asl = 2;
  int32 quality = 3;
  int32 size = 4;
  int32 topology = 5;
  int32 year = 6;

  repeated string flags = 7;
}
/** Summarises the output of the ranking function */
message RpcResultDocumentRankingOutputs {
  // NOTE(review): `factor` and `value` look like parallel lists meant to be
  // paired up by index -- confirm against the producer.
  repeated string factor = 1;
  repeated string value = 2;
}
/** Term-level ranking outputs carried by RpcResultRankingDetails */
message RpcResultTermRankingOutputs {
  // NOTE(review): the four lists below look like parallel lists meant to be
  // paired up by index -- confirm against the producer.
  repeated int64 termId = 1;
  repeated string term = 2;
  repeated string factor = 3;
  repeated string value = 4;
}
/** Defines a single subquery */
message RpcQuery {
  // These terms must be present.
  repeated string include = 1;

  // These terms must be absent.
  repeated string exclude = 2;

  // These terms must be present, but do not affect ranking.
  repeated string advice = 3;

  // These terms are not mandatory, but affect ranking positively if they
  // are present.
  repeated string priority = 4;

  // Groups of terms that must exist in proximity of each other.
  repeated RpcPhrases phrases = 5;

  // Compiled query in infix notation.
  string compiledQuery = 6;
}
/**
 * Defines a group of search terms that must exist in the specified order
 * within the document.
 */
message RpcPhrases {
  // Kinds of phrase group. OPTIONAL is the zero value and therefore the
  // proto3 default when `type` is left unset.
  enum TYPE {
    OPTIONAL = 0;
    MANDATORY = 1;
    FULL = 2;
  }

  // The terms of the group, in order.
  repeated string terms = 1;

  TYPE type = 2;
}