2023-10-24 09:09:12 +00:00
|
|
|
syntax = "proto3";

package nu.marginalia.api.searchquery;

// Java code generation: one top-level class per message/service, placed in
// the same package name as the proto package.
option java_multiple_files = true;
option java_package = "nu.marginalia.api.searchquery";
|
|
|
|
|
|
|
|
/* Service with a single unary call: takes an RpcQsQuery, returns an RpcQsResponse */
service QueryApi {
  rpc query(RpcQsQuery) returns (RpcQsResponse);
}
|
|
|
|
/* Service with a single server-streaming call: takes an RpcIndexQuery, streams back RpcDecoratedResultItem messages */
service IndexApi {
  rpc query(RpcIndexQuery) returns (stream RpcDecoratedResultItem);
}
|
|
|
|
|
|
|
|
/* A message with no fields */
message Empty {
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* Query Service query request */
message RpcQsQuery {
  // Field number 11 is skipped by the declarations below (presumably a
  // removed field -- confirm). It is reserved so that it cannot be reused
  // with a different meaning by accident.
  reserved 11;

  string humanQuery = 1;
  string nearDomain = 2;

  repeated string tacitIncludes = 3;
  repeated string tacitExcludes = 4;
  repeated string tacitPriority = 5;
  repeated string tacitAdvice = 6;

  // Limits, each expressed as an RpcSpecLimit
  RpcSpecLimit quality = 7;
  RpcSpecLimit year = 8;
  RpcSpecLimit size = 9;
  RpcSpecLimit rank = 10;

  repeated int32 domainIds = 12;
  RpcQueryLimits queryLimits = 13;
  string searchSetIdentifier = 14;
  string queryStrategy = 15; // Named query configuration
  RpcTemporalBias temporalBias = 16;

  RpcQsQueryPagination pagination = 17;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* Query service query response */
message RpcQsResponse {
  RpcIndexQuery specs = 1;
  repeated RpcDecoratedResultItem results = 2;
  repeated string searchTermsHuman = 3;
  repeated string problems = 4;
  string domain = 5;

  RpcQsResultPagination pagination = 6;
}
|
|
|
|
|
|
|
|
/* Pagination part of a query service request (used as RpcQsQuery.pagination) */
message RpcQsQueryPagination {
  int32 page = 1;
  int32 pageSize = 2;
}
|
|
|
|
|
|
|
|
/* Pagination part of a query service response (used as RpcQsResponse.pagination) */
message RpcQsResultPagination {
  int32 page = 1;
  int32 pageSize = 2;
  int32 totalResults = 3;
}
|
|
|
|
|
2024-02-15 12:39:51 +00:00
|
|
|
/* Wrapper around a single Bias enum value */
message RpcTemporalBias {
  enum Bias {
    NONE = 0; // zero value, i.e. the proto3 default when unset
    RECENT = 1;
    OLD = 2;
  }

  Bias bias = 1;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* Index service query request */
message RpcIndexQuery {
  // Field number 9 is skipped by the declarations below (presumably a
  // removed field -- confirm). It is reserved so that it cannot be reused
  // with a different meaning by accident.
  reserved 9;

  RpcQuery query = 1;
  repeated int32 domains = 2; // (optional) A list of domain IDs to consider
  string searchSetIdentifier = 3; // (optional) A named set of domains to consider
  string humanQuery = 4; // The search query as the user entered it

  // Limits, each expressed as an RpcSpecLimit
  RpcSpecLimit quality = 5;
  RpcSpecLimit year = 6;
  RpcSpecLimit size = 7;
  RpcSpecLimit rank = 8;

  RpcQueryLimits queryLimits = 10;
  string queryStrategy = 11; // Named query configuration
  RpcResultRankingParameters parameters = 12;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* A tagged union encoding some limit on a field */
message RpcSpecLimit {
  // The tag of the union
  enum TYPE {
    NONE = 0; // zero value, i.e. the proto3 default when unset
    EQUALS = 1;
    LESS_THAN = 2;
    GREATER_THAN = 3;
  }

  int32 value = 1;
  TYPE type = 2;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/** A search result item decorated with title and description metadata from the link database */
message RpcDecoratedResultItem {
  RpcRawResultItem rawItem = 1;

  string url = 2;
  string title = 3;
  string description = 4;
  double urlQuality = 5;
  string format = 6;
  int32 features = 7; // bitmask encoding features of the document
  int32 pubYear = 8;
  int64 dataHash = 9;
  int32 wordsTotal = 10;
  double rankingScore = 11; // The ranking score of this search result item, lower is better
  int64 bestPositions = 12;

  // optional, only present if exportDebugData is true in RpcResultRankingParameters
  RpcResultRankingDetails rankingDetails = 13;

  int32 resultsFromDomain = 14;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/** A raw index-service view of a search result */
message RpcRawResultItem {
  // Field number 2 is skipped by the declarations below (presumably a
  // removed field -- confirm). It is reserved so that it cannot be reused
  // with a different meaning by accident.
  reserved 2;

  enum MATCH_TYPE {
    FLAGS = 0; // zero value, i.e. the proto3 default when unset
    PROXIMITY = 1;
    PHRASE = 2;
  }

  int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
  int64 encodedDocMetadata = 3; // bit encoded document metadata
  int32 htmlFeatures = 4; // bitmask encoding features of the document
  repeated RpcResultKeywordScore keywordScores = 5;

  // true if this word is important to the document
  // NOTE(review): the wording above describes a word, but this flag sits on
  // a result item -- confirm the intended meaning.
  bool hasPriorityTerms = 6;

  MATCH_TYPE matchType = 7; // the type of match this result represents
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
  string keyword = 1; // the keyword
  int32 flags = 2;
  int32 positions = 3;
}
|
|
|
|
|
2023-10-29 15:13:21 +00:00
|
|
|
/* Query execution parameters */
message RpcQueryLimits {
  int32 resultsByDomain = 1;
  int32 resultsTotal = 2;
  int32 timeoutMs = 3;
  int32 fetchSize = 4; // Size of the fetch buffer in the index service
}
|
|
|
|
|
2024-07-15 02:49:28 +00:00
|
|
|
/** Parameters for the result ranking function */
message RpcResultRankingParameters {
  // Field numbers 3, 4 and 12 are not used by the declarations below
  // (presumably removed fields -- confirm). They are reserved so that they
  // cannot be reused with a different meaning by accident.
  reserved 3, 4, 12;

  double bm25K = 1; // BM25 parameter
  double bm25B = 2; // BM25 parameter

  int32 shortDocumentThreshold = 5;
  double shortDocumentPenalty = 6;
  double domainRankBonus = 7;
  double qualityPenalty = 8;
  int32 shortSentenceThreshold = 9;
  double shortSentencePenalty = 10;

  double bm25Weight = 11;
  double tcfFirstPositionWeight = 13;
  double tcfVerbatimWeight = 14;
  double tcfProximityWeight = 15;

  RpcTemporalBias temporalBias = 16;
  double temporalBiasWeight = 17;

  // When true, RpcDecoratedResultItem.rankingDetails is populated
  // (per the comment on that field).
  bool exportDebugData = 18;
  bool disablePenalties = 19;
}
|
|
|
|
|
|
|
|
/* Ranking details attached to a result item: document-level and term-level ranking outputs */
message RpcResultRankingDetails {
  RpcResultDocumentRankingOutputs documentOutputs = 1;
  RpcResultTermRankingOutputs termOutputs = 2;
}
|
|
|
|
|
|
|
|
/* NOTE(review): not referenced by any other message or RPC in the visible part of this file */
message RpcResultRankingInputs {
  int32 rank = 1;
  int32 asl = 2;
  int32 quality = 3;
  int32 size = 4;
  int32 topology = 5;
  int32 year = 6;
  repeated string flags = 7;
}
|
|
|
|
|
2024-07-15 02:49:28 +00:00
|
|
|
/** Summary of the output of the ranking function */
message RpcResultDocumentRankingOutputs {
  // NOTE(review): presumably parallel lists, where factor[i] names the
  // entry whose value is value[i] -- confirm against the producer.
  repeated string factor = 1;
  repeated string value = 2;
}
|
|
|
|
|
|
|
|
/* Term-level ranking outputs (used as RpcResultRankingDetails.termOutputs) */
message RpcResultTermRankingOutputs {
  // NOTE(review): presumably four parallel lists indexed together
  // -- confirm against the producer.
  repeated int64 termId = 1;
  repeated string term = 2;
  repeated string factor = 3;
  repeated string value = 4;
}
|
2023-10-29 15:13:21 +00:00
|
|
|
|
|
|
|
/* Defines a single subquery */
message RpcQuery {
  // These terms must be present
  repeated string include = 1;

  // These terms must be absent
  repeated string exclude = 2;

  // These terms must be present, but do not affect ranking
  repeated string advice = 3;

  // These terms are not mandatory, but affect ranking positively if they are present
  repeated string priority = 4;

  // Groups of terms that must exist in proximity of each other
  repeated RpcPhrases phrases = 5;

  // Compiled query in infix notation
  string compiledQuery = 6;
}
|
|
|
|
|
2024-08-15 09:02:19 +00:00
|
|
|
/* Defines a group of search terms that must exist in the specified order within the document */
|
|
|
|
message RpcPhrases {
  enum TYPE {
    OPTIONAL = 0; // zero value, i.e. the proto3 default when unset
    MANDATORY = 1;
    FULL = 2;
  }

  // The terms making up this group
  repeated string terms = 1;
  TYPE type = 2;
}
|