syntax = "proto3";

package nu.marginalia.api.searchquery;

option java_multiple_files = true;
option java_package = "nu.marginalia.api.searchquery";

// Query service API: takes one RpcQsQuery and returns one RpcQsResponse.
service QueryApi {
  rpc query(RpcQsQuery) returns (RpcQsResponse) {}
}

// Index service API: takes one RpcIndexQuery and streams back
// RpcDecoratedResultItem messages.
service IndexApi {
  rpc query(RpcIndexQuery) returns (stream RpcDecoratedResultItem) {}
}

// Message with no fields. Not referenced by any other definition in this file.
message Empty {}

// Query Service query request
message RpcQsQuery {
  // Field number 11 is skipped in the numbering below; it is reserved so it
  // cannot be reassigned by accident.
  // NOTE(review): presumably a removed field -- confirm.
  reserved 11;

  // NOTE(review): presumably the query as the user entered it, as documented
  // for RpcIndexQuery.humanQuery -- confirm.
  string humanQuery = 1;
  string nearDomain = 2;
  // NOTE(review): the four tacit* lists presumably mirror the include,
  // exclude, priority and advice term lists of RpcQuery -- confirm.
  repeated string tacitIncludes = 3;
  repeated string tacitExcludes = 4;
  repeated string tacitPriority = 5;
  repeated string tacitAdvice = 6;
  // Limits expressed as RpcSpecLimit (a value plus a comparison type).
  RpcSpecLimit quality = 7;
  RpcSpecLimit year = 8;
  RpcSpecLimit size = 9;
  RpcSpecLimit rank = 10;
  repeated int32 domainIds = 12;
  RpcQueryLimits queryLimits = 13;
  string searchSetIdentifier = 14;
  // Named query configuration
  string queryStrategy = 15;
  RpcTemporalBias temporalBias = 16;
  RpcQsQueryPagination pagination = 17;
}

// Query service query response
message RpcQsResponse {
  RpcIndexQuery specs = 1;
  repeated RpcDecoratedResultItem results = 2;
  repeated string searchTermsHuman = 3;
  repeated string problems = 4;
  string domain = 5;
  RpcQsResultPagination pagination = 6;
}

// Pagination part of a query service request (see RpcQsQuery.pagination).
message RpcQsQueryPagination {
  int32 page = 1;
  int32 pageSize = 2;
}

// Pagination part of a query service response (see RpcQsResponse.pagination).
// Carries the same page and pageSize fields as the request, plus a total.
message RpcQsResultPagination {
  int32 page = 1;
  int32 pageSize = 2;
  int32 totalResults = 3;
}

// Temporal bias selection; used by RpcQsQuery and RpcResultRankingParameters.
message RpcTemporalBias {
  enum Bias {
    // Zero value, i.e. what an unset bias field reads as.
    NONE = 0;
    RECENT = 1;
    OLD = 2;
  }

  Bias bias = 1;
}

// Index service query request
message RpcIndexQuery {
  // Field number 9 is skipped in the numbering below; it is reserved so it
  // cannot be reassigned by accident.
  // NOTE(review): presumably a removed field -- confirm.
  reserved 9;

  RpcQuery query = 1;
  // (optional) A list of domain IDs to consider
  repeated int32 domains = 2;
  // (optional) A named set of domains to consider
  string searchSetIdentifier = 3;
  // The search query as the user entered it
  string humanQuery = 4;
  // Limits expressed as RpcSpecLimit (a value plus a comparison type).
  RpcSpecLimit quality = 5;
  RpcSpecLimit year = 6;
  RpcSpecLimit size = 7;
  RpcSpecLimit rank = 8;
  RpcQueryLimits queryLimits = 10;
  // Named query configuration
  string queryStrategy = 11;
  RpcResultRankingParameters parameters = 12;
}

// A tagged union encoding some limit on a field
message RpcSpecLimit {
  enum TYPE {
    // Zero value, i.e. what an unset type field reads as.
    NONE = 0;
    EQUALS = 1;
    LESS_THAN = 2;
    GREATER_THAN = 3;
  }

  // Operand of the limit.
  int32 value = 1;
  // The tag: which comparison the limit expresses.
  TYPE type = 2;
}

// A search result item decorated with title and description metadata from the
// link database
message RpcDecoratedResultItem {
  RpcRawResultItem rawItem = 1;
  string url = 2;
  string title = 3;
  string description = 4;
  double urlQuality = 5;
  string format = 6;
  // bitmask encoding features of the document
  int32 features = 7;
  int32 pubYear = 8;
  int64 dataHash = 9;
  int32 wordsTotal = 10;
  // The ranking score of this search result item, lower is better
  double rankingScore = 11;
  int64 bestPositions = 12;
  // optional, only present if exportDebugData is true in
  // RpcResultRankingParameters
  RpcResultRankingDetails rankingDetails = 13;
  int32 resultsFromDomain = 14;
}

// A raw index-service view of a search result
message RpcRawResultItem {
  // Field number 2 is skipped in the numbering below; it is reserved so it
  // cannot be reassigned by accident.
  // NOTE(review): presumably a removed field -- confirm.
  reserved 2;

  enum MATCH_TYPE {
    // Zero value, i.e. what an unset matchType field reads as.
    FLAGS = 0;
    PROXIMITY = 1;
    PHRASE = 2;
  }

  // raw ID with bit-encoded ranking information still present
  int64 combinedId = 1;
  // bit encoded document metadata
  int64 encodedDocMetadata = 3;
  // bitmask encoding features of the document
  int32 htmlFeatures = 4;
  repeated RpcResultKeywordScore keywordScores = 5;
  // true if this word is important to the document
  // NOTE(review): the comment above is the original wording; it speaks of a
  // "word" although this field sits on a result item -- confirm the meaning.
  bool hasPriorityTerms = 6;
  // the type of match this result represents
  MATCH_TYPE matchType = 7;
}

// Information about how well a keyword matches a query
message RpcResultKeywordScore {
  // the keyword
  string keyword = 1;
  int32 flags = 2;
  int32 positions = 3;
}

// Query execution parameters
message RpcQueryLimits {
  int32 resultsByDomain = 1;
  int32 resultsTotal = 2;
  int32 timeoutMs = 3;
  // Size of the fetch buffer in the index service
  int32 fetchSize = 4;
}

// Parameters for the result ranking function
message RpcResultRankingParameters {
  // Field numbers 3, 4 and 12 are skipped in the numbering below (12 was
  // marked "unused" in the original comments); they are reserved so they
  // cannot be reassigned by accident.
  // NOTE(review): presumably removed fields -- confirm.
  reserved 3, 4, 12;

  // BM25 parameter
  double bm25K = 1;
  // BM25 parameter
  double bm25B = 2;
  int32 shortDocumentThreshold = 5;
  double shortDocumentPenalty = 6;
  double domainRankBonus = 7;
  double qualityPenalty = 8;
  int32 shortSentenceThreshold = 9;
  double shortSentencePenalty = 10;
  double bm25Weight = 11;
  double tcfFirstPositionWeight = 13;
  double tcfVerbatimWeight = 14;
  double tcfProximityWeight = 15;
  RpcTemporalBias temporalBias = 16;
  double temporalBiasWeight = 17;
  // RpcDecoratedResultItem.rankingDetails is documented as only present when
  // this flag is true.
  bool exportDebugData = 18;
  bool disablePenalties = 19;
}

// Ranking details attached to a result item through
// RpcDecoratedResultItem.rankingDetails.
message RpcResultRankingDetails {
  RpcResultDocumentRankingOutputs documentOutputs = 1;
  RpcResultTermRankingOutputs termOutputs = 2;
}

// Not referenced by any other message in this file.
// NOTE(review): may be used elsewhere or be a leftover -- confirm.
message RpcResultRankingInputs {
  int32 rank = 1;
  int32 asl = 2;
  int32 quality = 3;
  int32 size = 4;
  int32 topology = 5;
  int32 year = 6;
  repeated string flags = 7;
}

// Summary of the output of the ranking function
message RpcResultDocumentRankingOutputs {
  // NOTE(review): factor and value look like parallel lists paired by index;
  // the schema itself does not enforce this -- confirm.
  repeated string factor = 1;
  repeated string value = 2;
}

// Per-term output of the ranking function (see RpcResultRankingDetails).
message RpcResultTermRankingOutputs {
  // NOTE(review): the four lists look like parallel lists paired by index;
  // the schema itself does not enforce this -- confirm.
  repeated int64 termId = 1;
  repeated string term = 2;
  repeated string factor = 3;
  repeated string value = 4;
}

// Defines a single subquery
message RpcQuery {
  // These terms must be present
  repeated string include = 1;
  // These terms must be absent
  repeated string exclude = 2;
  // These terms must be present, but do not affect ranking
  repeated string advice = 3;
  // These terms are not mandatory, but affect ranking positively if they are
  // present
  repeated string priority = 4;
  // Groups of terms that must exist in proximity of each other
  repeated RpcPhrases phrases = 5;
  // Compiled query in infix notation
  string compiledQuery = 6;
}

// Defines a group of search terms that must exist in the specified order
// within the document
message RpcPhrases {
  enum TYPE {
    // Zero value, i.e. what an unset type field reads as.
    OPTIONAL = 0;
    MANDATORY = 1;
    FULL = 2;
  }

  repeated string terms = 1;
  TYPE type = 2;
}