dynamo-llm 1.0.2

// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

syntax = "proto3";

package inference;

//@@.. cpp:namespace:: inference

import "model_config.proto";

//@@
//@@.. cpp:var:: service InferenceService
//@@
//@@   Inference Server GRPC endpoints.
//@@
service GRPCInferenceService
{
  //@@  .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
  //@@       (ServerLiveResponse)
  //@@
  //@@     Check liveness of the inference server.
  //@@
  rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}

  //@@  .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
  //@@       (ServerReadyResponse)
  //@@
  //@@     Check readiness of the inference server.
  //@@
  rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}

  //@@  .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
  //@@       (ModelReadyResponse)
  //@@
  //@@     Check readiness of a model in the inference server.
  //@@
  rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}

  //@@  .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
  //@@       (ModelMetadataResponse)
  //@@
  //@@     Get model metadata.
  //@@
  rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {}

  //@@  .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns
  //@@       (ModelInferResponse)
  //@@
  //@@     Perform inference using a specific model.
  //@@
  rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {}

  //@@  .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns
  //@@       (stream ModelStreamInferResponse)
  //@@
  //@@     Perform streaming inference.
  //@@
  rpc ModelStreamInfer(stream ModelInferRequest) returns (stream ModelStreamInferResponse) {}

  //@@  .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns
  //@@       (ModelConfigResponse)
  //@@
  //@@     Get model configuration.
  //@@
  rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}

//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@   Request message for ServerLive.
//@@
message ServerLiveRequest {}

//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@   Response message for ServerLive.
//@@
message ServerLiveResponse
{
  //@@
  //@@  .. cpp:var:: bool live
  //@@
  //@@     True if the inference server is live, false if not live.
  //@@
  bool live = 1;
}

//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@   Request message for ServerReady.
//@@
message ServerReadyRequest {}

//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@   Response message for ServerReady.
//@@
message ServerReadyResponse
{
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the inference server is ready, false if not ready. The server
  //@@     is considered ready if it has any registered models, since models
  //@@     can freely be registered and unregistered at runtime.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@   Request message for ModelReady.
//@@
message ModelReadyRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model to check for readiness.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model to check for readiness. If not given the
  //@@     server will choose a version based on the model and internal policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@   Response message for ModelReady.
//@@
message ModelReadyResponse
{
  //@@
  //@@  .. cpp:var:: bool ready
  //@@
  //@@     True if the model is ready, false if not ready.
  //@@
  bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
//@@   Request message for ModelMetadata.
//@@
message ModelMetadataRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model to check for readiness. If not
  //@@     given the server will choose a version based on the
  //@@     model and internal policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelMetadataResponse
//@@
//@@   Response message for ModelMetadata.
//@@
message ModelMetadataResponse
{
  //@@
  //@@  .. cpp:var:: message TensorMetadata
  //@@
  //@@     Metadata for a tensor.
  //@@
  message TensorMetadata
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape. A variable-size dimension is represented
    //@@       by a -1 value.
    //@@
    repeated int64 shape = 3;
  }

  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The model name.
  //@@
  string name = 1;

  //@@
  //@@  .. cpp:var:: string versions (repeated)
  //@@
  //@@     The versions of the model.
  //@@
  repeated string versions = 2;

  //@@
  //@@  .. cpp:var:: string platform
  //@@
  //@@     The model's platform.
  //@@
  string platform = 3;

  //@@
  //@@  .. cpp:var:: TensorMetadata inputs (repeated)
  //@@
  //@@     The model's inputs.
  //@@
  repeated TensorMetadata inputs = 4;

  //@@
  //@@  .. cpp:var:: TensorMetadata outputs (repeated)
  //@@
  //@@     The model's outputs.
  //@@
  repeated TensorMetadata outputs = 5;
}

//@@
//@@.. cpp:var:: message InferParameter
//@@
//@@   An inference parameter value.
//@@
message InferParameter
{
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64,
  //@@     an uint64, a double, or a boolean
  //@@
  //@@     Note: double and uint64 are currently
  //@@           placeholders for future use and
  //@@           are not supported for custom parameters
  //@@
  oneof parameter_choice
  {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;

    //@@    .. cpp:var:: double double_param
    //@@
    //@@       A double parameter value.
    //@@
    double double_param = 4;

    //@@    .. cpp:var:: uint64 uint64_param
    //@@
    //@@       A uint64 parameter value.
    //@@
    //@@       Not supported for custom parameters
    //@@
    uint64 uint64_param = 5;
  }
}

//@@
//@@.. cpp:var:: message InferTensorContents
//@@
//@@   The data contained in a tensor represented by the repeated type
//@@   that matches the tensor's data type. Protobuf oneof is not used
//@@   because oneofs cannot contain repeated fields.
//@@
message InferTensorContents
{
  //@@
  //@@  .. cpp:var:: bool bool_contents (repeated)
  //@@
  //@@     Representation for BOOL data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bool bool_contents = 1;

  //@@
  //@@  .. cpp:var:: int32 int_contents (repeated)
  //@@
  //@@     Representation for INT8, INT16, and INT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated int32 int_contents = 2;

  //@@
  //@@  .. cpp:var:: int64 int64_contents (repeated)
  //@@
  //@@     Representation for INT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated int64 int64_contents = 3;

  //@@
  //@@  .. cpp:var:: uint32 uint_contents (repeated)
  //@@
  //@@     Representation for UINT8, UINT16, and UINT32 data types. The size
  //@@     must match what is expected by the tensor's shape. The contents
  //@@     must be the flattened, one-dimensional, row-major order of the
  //@@     tensor elements.
  //@@
  repeated uint32 uint_contents = 4;

  //@@
  //@@  .. cpp:var:: uint64 uint64_contents (repeated)
  //@@
  //@@     Representation for UINT64 data types. The size must match what
  //@@     is expected by the tensor's shape. The contents must be the
  //@@     flattened, one-dimensional, row-major order of the tensor elements.
  //@@
  repeated uint64 uint64_contents = 5;

  //@@
  //@@  .. cpp:var:: float fp32_contents (repeated)
  //@@
  //@@     Representation for FP32 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated float fp32_contents = 6;

  //@@
  //@@  .. cpp:var:: double fp64_contents (repeated)
  //@@
  //@@     Representation for FP64 data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated double fp64_contents = 7;

  //@@
  //@@  .. cpp:var:: bytes bytes_contents (repeated)
  //@@
  //@@     Representation for BYTES data type. The size must match what is
  //@@     expected by the tensor's shape. The contents must be the flattened,
  //@@     one-dimensional, row-major order of the tensor elements.
  //@@
  repeated bytes bytes_contents = 8;
}

//@@
//@@.. cpp:var:: message ModelInferRequest
//@@
//@@   Request message for ModelInfer.
//@@
message ModelInferRequest
{
  //@@
  //@@  .. cpp:var:: message InferInputTensor
  //@@
  //@@     An input tensor for an inference request.
  //@@
  message InferInputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional inference input tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferRequest.raw_input_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@
  //@@  .. cpp:var:: message InferRequestedOutputTensor
  //@@
  //@@     An output tensor requested for an inference request.
  //@@
  message InferRequestedOutputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional requested output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 2;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model to use for inferencing.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model to use for inference. If not
  //@@     given the latest/most-recent version of the model is used.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     Optional identifier for the request. If specified will be
  //@@     returned in the response.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferInputTensor inputs (repeated)
  //@@
  //@@     The input tensors for the inference.
  //@@
  repeated InferInputTensor inputs = 5;

  //@@
  //@@  .. cpp:var:: InferRequestedOutputTensor outputs (repeated)
  //@@
  //@@     The requested output tensors for the inference. Optional, if not
  //@@     specified all outputs specified in the model config will be
  //@@     returned.
  //@@
  repeated InferRequestedOutputTensor outputs = 6;

  //@@
  //@@  .. cpp:var:: bytes raw_input_contents
  //@@
  //@@     The data contained in an input tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_input_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'inputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferInputTensor::contents
  //@@     must not be specified for any input tensor.
  //@@
  repeated bytes raw_input_contents = 7;
}

//@@
//@@.. cpp:var:: message ModelInferResponse
//@@
//@@   Response message for ModelInfer.
//@@
message ModelInferResponse
{
  //@@
  //@@  .. cpp:var:: message InferOutputTensor
  //@@
  //@@     An output tensor returned for an inference request.
  //@@
  message InferOutputTensor
  {
    //@@
    //@@    .. cpp:var:: string name
    //@@
    //@@       The tensor name.
    //@@
    string name = 1;

    //@@
    //@@    .. cpp:var:: string datatype
    //@@
    //@@       The tensor data type.
    //@@
    string datatype = 2;

    //@@
    //@@    .. cpp:var:: int64 shape (repeated)
    //@@
    //@@       The tensor shape.
    //@@
    repeated int64 shape = 3;

    //@@    .. cpp:var:: map<string,InferParameter> parameters
    //@@
    //@@       Optional output tensor parameters.
    //@@
    map<string, InferParameter> parameters = 4;

    //@@    .. cpp:var:: InferTensorContents contents
    //@@
    //@@       The tensor contents using a data-type format. This field
    //@@       must not be specified if tensor contents are being specified
    //@@       in ModelInferResponse.raw_output_contents.
    //@@
    InferTensorContents contents = 5;
  }

  //@@  .. cpp:var:: string model_name
  //@@
  //@@     The name of the model used for inference.
  //@@
  string model_name = 1;

  //@@  .. cpp:var:: string model_version
  //@@
  //@@     The version of the model used for inference.
  //@@
  string model_version = 2;

  //@@  .. cpp:var:: string id
  //@@
  //@@     The id of the inference request if one was specified.
  //@@
  string id = 3;

  //@@  .. cpp:var:: map<string,InferParameter> parameters
  //@@
  //@@     Optional inference response parameters.
  //@@
  map<string, InferParameter> parameters = 4;

  //@@
  //@@  .. cpp:var:: InferOutputTensor outputs (repeated)
  //@@
  //@@     The output tensors holding inference results.
  //@@
  repeated InferOutputTensor outputs = 5;

  //@@
  //@@  .. cpp:var:: bytes raw_output_contents
  //@@
  //@@     The data contained in an output tensor can be represented in
  //@@     "raw" bytes form or in the repeated type that matches the
  //@@     tensor's data type. Using the "raw" bytes form will
  //@@     typically allow higher performance due to the way protobuf
  //@@     allocation and reuse interacts with GRPC. For example, see
  //@@     https://github.com/grpc/grpc/issues/23231.
  //@@
  //@@     To use the raw representation 'raw_output_contents' must be
  //@@     initialized with data for each tensor in the same order as
  //@@     'outputs'. For each tensor, the size of this content must
  //@@     match what is expected by the tensor's shape and data
  //@@     type. The raw data must be the flattened, one-dimensional,
  //@@     row-major order of the tensor elements without any stride
  //@@     or padding between the elements. Note that the FP16 and BF16 data
  //@@     types must be represented as raw content as there is no
  //@@     specific data type for a 16-bit float type.
  //@@
  //@@     If this field is specified then InferOutputTensor::contents
  //@@     must not be specified for any output tensor.
  //@@
  repeated bytes raw_output_contents = 6;
}

//@@
//@@.. cpp:var:: message ModelStreamInferResponse
//@@
//@@   Response message for ModelStreamInfer.
//@@
message ModelStreamInferResponse
{
  //@@
  //@@  .. cpp:var:: string error_message
  //@@
  //@@     The message describing the error. The empty message
  //@@     indicates the inference was successful without errors.
  //@@
  string error_message = 1;

  //@@
  //@@  .. cpp:var:: ModelInferResponse infer_response
  //@@
  //@@     Holds the results of the request.
  //@@
  ModelInferResponse infer_response = 2;
}

//@@
//@@.. cpp:var:: message ModelConfigRequest
//@@
//@@   Request message for ModelConfig.
//@@
message ModelConfigRequest
{
  //@@
  //@@  .. cpp:var:: string name
  //@@
  //@@     The name of the model.
  //@@
  string name = 1;

  //@@  .. cpp:var:: string version
  //@@
  //@@     The version of the model. If not given the model version
  //@@     is selected automatically based on the version policy.
  //@@
  string version = 2;
}

//@@
//@@.. cpp:var:: message ModelConfigResponse
//@@
//@@   Response message for ModelConfig.
//@@
message ModelConfigResponse
{
  //@@
  //@@  .. cpp:var:: ModelConfig config
  //@@
  //@@     The model configuration.
  //@@
  ModelConfig config = 1;
}

//@@
//@@.. cpp:var:: message ModelRepositoryParameter
//@@
//@@   An model repository parameter value.
//@@
message ModelRepositoryParameter
{
  //@@  .. cpp:var:: oneof parameter_choice
  //@@
  //@@     The parameter value can be a string, an int64 or
  //@@     a boolean
  //@@
  oneof parameter_choice
  {
    //@@    .. cpp:var:: bool bool_param
    //@@
    //@@       A boolean parameter value.
    //@@
    bool bool_param = 1;

    //@@    .. cpp:var:: int64 int64_param
    //@@
    //@@       An int64 parameter value.
    //@@
    int64 int64_param = 2;

    //@@    .. cpp:var:: string string_param
    //@@
    //@@       A string parameter value.
    //@@
    string string_param = 3;

    //@@    .. cpp:var:: bytes bytes_param
    //@@
    //@@       A bytes parameter value.
    //@@
    bytes bytes_param = 4;
  }
}