diff --git a/code/api/index-api/src/main/protobuf/index-api.proto b/code/api/index-api/src/main/protobuf/index-api.proto index bc177746..6862857f 100644 --- a/code/api/index-api/src/main/protobuf/index-api.proto +++ b/code/api/index-api/src/main/protobuf/index-api.proto @@ -13,6 +13,7 @@ service IndexApi { message Empty {} +/* Query Service query request */ message RpcQsQuery { string humanQuery = 1; string nearDomain = 2; @@ -29,6 +30,7 @@ message RpcQsQuery { string searchSetIdentifier = 13; } +/* Query service query response */ message RpcQsResponse { RpcIndexQuery specs = 1; repeated RpcDecoratedResultItem results = 2; @@ -37,20 +39,22 @@ message RpcQsResponse { string domain = 5; } +/* Index service query request */ message RpcIndexQuery { repeated RpcSubquery subqueries = 1; - repeated int32 domains = 2; - string searchSetIdentifier = 3; - string humanQuery = 4; + repeated int32 domains = 2; // (optional) A list of domain IDs to consider + string searchSetIdentifier = 3; // (optional) A named set of domains to consider + string humanQuery = 4; // The search query as the user entered it RpcSpecLimit quality = 5; RpcSpecLimit year = 6; RpcSpecLimit size = 7; RpcSpecLimit rank = 8; RpcQueryLimits queryLimits = 9; - string queryStrategy = 10; + string queryStrategy = 10; // Named query configuration RpcResultRankingParameters parameters = 11; } +/* A tagged union encoding some limit on a field */ message RpcSpecLimit { int32 value = 1; TYPE type = 2; @@ -63,10 +67,7 @@ message RpcSpecLimit { }; } -message RpcSearchResultSet { - repeated RpcDecoratedResultItem items = 1; -} - +/** A search result item decorated with title and description metadata from the link database */ message RpcDecoratedResultItem { RpcRawResultItem rawItem = 1; string url = 2; @@ -74,40 +75,43 @@ message RpcDecoratedResultItem { string description = 4; double urlQuality = 5; string format = 6; - int32 features = 7; + int32 features = 7; // bitmask encoding features of the document int32 
pubYear = 8; int64 dataHash = 9; int32 wordsTotal = 10; - double rankingScore = 11; + double rankingScore = 11; // The ranking score of this search result item, lower is better } +/** A raw index-service view of a search result */ message RpcRawResultItem { - int64 combinedId = 1; - int32 resultsFromDomain = 2; + int64 combinedId = 1; // raw ID with bit-encoded ranking information still present + int32 resultsFromDomain = 2; // number of other results from the same domain repeated RpcResultKeywordScore keywordScores = 3; } +/* Information about how well a keyword matches a query */ message RpcResultKeywordScore { - int32 subquery = 1; - string keyword = 2; - int64 encodedWordMetadata = 3; - int64 encodedDocMetadata = 4; - bool hasPriorityTerms = 5; - int32 htmlFeatures = 6; + int32 subquery = 1; // index of the subquery this keyword relates to + string keyword = 2; // the keyword + int64 encodedWordMetadata = 3; // bit-encoded word metadata + int64 encodedDocMetadata = 4; // bit-encoded document metadata + bool hasPriorityTerms = 5; // true if this word is important to the document + int32 htmlFeatures = 6; // bit-encoded document features } +/* Query execution parameters */ message RpcQueryLimits { int32 resultsByDomain = 1; int32 resultsTotal = 2; int32 timeoutMs = 3; - int32 fetchSize = 4; + int32 fetchSize = 4; // Size of the fetch buffer in the index service } message RpcResultRankingParameters { - double fullK = 1; - double fullB = 2; - double prioK = 3; - double prioB = 4; + double fullK = 1; // BM25 parameter + double fullB = 2; // BM25 parameter + double prioK = 3; // BM25 parameter + double prioB = 4; // BM25 parameter int32 shortDocumentThreshold = 5; double shortDocumentPenalty = 6; double domainRankBonus = 7; @@ -122,18 +126,21 @@ message RpcResultRankingParameters { enum TEMPORAL_BIAS { NONE = 0; - RECENT = 1; - OLD = 2; + RECENT = 1; // Prefer recent documents + OLD = 2; // Prefer older documents } } + +/* Defines a single subquery */ message
RpcSubquery { - repeated string include = 1; - repeated string exclude = 2; - repeated string advice = 3; - repeated string priority = 4; - repeated RpcCoherences coherences = 5; + repeated string include = 1; // These terms must be present + repeated string exclude = 2; // These terms must be absent + repeated string advice = 3; // These terms must be present, but do not affect ranking + repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present + repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity to each other } +/* Defines a group of search terms that must exist in close proximity within the document */ message RpcCoherences { repeated string coherences = 1; }