//! # **Perform easy and efficient ML model inference**
//!
//! This crate is designed to run **any** Machine Learning model on **any** architecture with ease and efficiency.  
//! It leverages the [Triton Inference Server](https://github.com/triton-inference-server/server)
//! (specifically the [Triton C library](https://github.com/triton-inference-server/core)) and provides a similar API with comparable advantages.
//! Unlike the standalone server, **Tritonserver-rs** lets you build the inference server directly into your own application, which offers significant performance benefits.
//! Check the [benchmark](https://github.com/3xMike/tritonserver-rs/blob/main/BENCH.md) for more details.
//!
//! ---
//!
//! # Usage  
//!
//! Run inference in three simple steps:
//!
//! ## **Step 1. Prepare the model repository**  
//!
//! Organize your model files in the following structure:
//!
//! ```text
//! models/
//! ├── yolov8/
//! │   ├── config.pbtxt
//! │   ├── 1/
//! │   │   └── model.onnx
//! │   ├── 2/
//! │   │   └── model.onnx
//! │   └── <other versions of yolov8>/
//! └── <other models>/
//! ```
//!
//! **Rules**:  
//! - All models must be stored in the same root directory (`models/` in this example).  
//! - Each model resides in its own folder containing:
//!   - A `config.pbtxt` configuration file (a minimal sketch is shown after these rules).
//!   - One or more subdirectories, each representing a version of the model and containing the model file (e.g., `model.onnx`).  
//!
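//! For illustration only, a minimal `config.pbtxt` for the `yolov8` model above might look like
//! the following sketch. The tensor names, data types, and shapes are placeholders and must match
//! your actual model; see the
//! [model configuration guide](https://github.com/3xMike/tritonserver-rs/blob/main/MODEL_CONFIGURATION.md)
//! for the full format.
//!
//! ```text
//! name: "yolov8"
//! backend: "onnxruntime"
//! max_batch_size: 0
//! input [
//!   {
//!     name: "IMAGE"          # must match the name passed to `add_input`
//!     data_type: TYPE_UINT8  # placeholder; use your model's input type
//!     dims: [ -1, -1, 3 ]    # placeholder shape
//!   }
//! ]
//! output [
//!   {
//!     name: "OUTPUT"         # placeholder; use your model's output name
//!     data_type: TYPE_FP32
//!     dims: [ -1, -1 ]
//!   }
//! ]
//! ```
//!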
//! ---
//!
//! ## **Step 2. Write the code**  
//!
//! Add **Tritonserver-rs** to your `Cargo.toml`:  
//!
//! ```toml
//! [dependencies]
//! tritonserver-rs = "0.1"
//! ```
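//!
//! If you plan to keep model inputs and outputs in GPU memory, the crate also exposes a `gpu`
//! Cargo feature (it gates the CUDA `context` module); enabling it might look like this:
//!
//! ```toml
//! [dependencies]
//! tritonserver-rs = { version = "0.1", features = ["gpu"] }
//! ```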
//!
//! Then write your application code:  
//!
//! ```ignore
//! use std::time::Duration;
//! use tritonserver_rs::{options::Options, Buffer, Server};
//!
//! # async fn run() -> Result<(), Box<dyn std::error::Error>> {
//! // Configure server options.
//! let mut opts = Options::new("models/")?;
//!
//! opts.exit_timeout(Duration::from_secs(5))?
//!     .backend_directory("/opt/tritonserver/backends")?;
//!
//! // Create the server.
//! let server = Server::new(opts).await?;
//!
//! // Load the input data.
//! let image = image::open("/data/cats.jpg")?;
//! let image = image.as_flat_samples_u8();
//!
//! // Create a request (specify the model name and version).
//! let mut request = server.create_request("yolov8", 2)?;
//!
//! // Add the input data and an allocator.
//! request
//!     .add_default_allocator()
//!     .add_input("IMAGE", Buffer::from(image))?;
//!
//! // Run inference asynchronously.
//! let fut = request.infer_async()?;
//!
//! // Await the results.
//! let response = fut.await?;
//! # Ok(())
//! # }
//! ```
//!
//! ---
//!
//! ## **Step 3. Deploy**
//!
//! Here is an example of how to deploy using `docker-compose.yml`:  
//!
//! ```yml
//! my_app:
//!   image: {DEV_IMAGE}
//!   volumes:
//!     - ./Cargo.toml:/project/Cargo.toml
//!     - ./src:/project/src
//!     - ../models:/models
//!     - ../cats.jpg:/data/cats.jpg
//!   entrypoint: ["cargo", "run", "--manifest-path=/project/Cargo.toml"]
//! ```
//!
//! We recommend building `{DEV_IMAGE}` from the provided `Dockerfile.dev`. For more details on suitable images and deployment instructions, see [DEPLOY.md](https://github.com/3xMike/tritonserver-rs/blob/main/DEPLOY.md).
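//!
//! As a rough, hypothetical sketch of what such a development image could look like (the
//! `Dockerfile.dev` shipped in the repository is the authoritative reference), one might start
//! from an official Triton image and add a Rust toolchain:
//!
//! ```text
//! # Hypothetical sketch only; see Dockerfile.dev and DEPLOY.md in the repository.
//! FROM nvcr.io/nvidia/tritonserver:24.05-py3
//! RUN apt-get update && apt-get install -y curl build-essential \
//!     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
//! ENV PATH="/root/.cargo/bin:${PATH}"
//! WORKDIR /project
//! ```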
//!
//! ---
//!
//! # **More Information**
//!
//! For further details, check out the following resources in the [GitHub repository](https://github.com/3xMike/tritonserver-rs/blob/main):
//! - [Examples](https://github.com/3xMike/tritonserver-rs/blob/main/examples): Learn how to run various ML models using **Tritonserver-rs**, configure inference, prepare models, and deploy.  
//! - [Model configuration guide](https://github.com/3xMike/tritonserver-rs/blob/main/MODEL_CONFIGURATION.md).  
//! - [Build and deployment instructions](https://github.com/3xMike/tritonserver-rs/blob/main/DEPLOY.md).  
//! - [Benchmark results](https://github.com/3xMike/tritonserver-rs/blob/main/BENCH.md).  
//! - [Triton Inference Server guides](https://github.com/triton-inference-server/server/tree/main/docs/README.md).  
//!
//! ---
//!
//! # **Advantages of the Crate**
//!
//! - **Versatility**: Extensive configuration options for models and servers.  
//! - **High performance**: Optimized for maximum efficiency.  
//! - **Broad backend support**: Run PyTorch, ONNX, TensorFlow, TensorRT, OpenVINO, model pipelines, and custom backends out of the box.  
//! - **Compatibility**: Supports most GPUs and architectures.  
//! - **Multi-model handling**: Serve multiple models simultaneously.  
//! - **Prometheus integration**: Built-in support for monitoring.  
//! - **CUDA-optimized**: Handle model inputs and outputs directly in GPU memory.  
//! - **Dynamic server management**: Advanced runtime control features.  
//! - **Rust-based**: Enjoy the safety, speed, and concurrency benefits of Rust.

#![allow(clippy::bad_bit_mask)]

/// Macros for running CUDA operations within a CUDA context.
#[macro_use]
pub mod macros;

pub(crate) mod allocator;
/// CUDA context for managing device execution.
#[cfg(feature = "gpu")]
pub mod context;
/// Error types for Tritonserver-rs.
pub mod error;
/// Memory management utilities for model inference.
pub mod memory;
/// Metadata message serialization/deserialization.
pub mod message;
/// Performance metrics collection and reporting.
pub mod metrics;
/// Configuration options for Tritonserver-rs server.
pub mod options;
/// Parameters used by inference requests and the server.
pub mod parameter;
/// Request builder and utilities for Triton server inference.
pub mod request;
/// Response handling and parsing from Triton server.
pub mod response;
/// Server initialization and lifecycle management.
pub mod server;
pub(crate) mod sys {
    #![allow(
        non_camel_case_types,
        non_upper_case_globals,
        non_snake_case,
        dead_code,
        unused_imports
    )]
    include!(concat!(env!("OUT_DIR"), "/tritonserver.rs"));
}
/// Tracing utilities for debugging and profiling.
pub mod trace;

pub use crate::{
    error::{Error, ErrorCode},
    memory::{Buffer, MemoryType},
    request::{Allocator, Request},
    response::Response,
    server::Server,
    sys::{TRITONSERVER_API_VERSION_MAJOR, TRITONSERVER_API_VERSION_MINOR},
};
#[cfg(feature = "gpu")]
pub use context::{get_context, init_cuda};

use std::ffi::CString;

/// Get the TRITONSERVER API version supported by the Triton shared library.
///
/// This value can be compared against [`TRITONSERVER_API_VERSION_MAJOR`] and
/// [`TRITONSERVER_API_VERSION_MINOR`], which the client was built with, to ensure that the
/// Triton shared library is compatible with the client.
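///
/// For example, a minimal startup check (a sketch that mirrors the crate's own test, which
/// expects the built-in constants to match the library exactly):
///
/// ```no_run
/// use tritonserver_rs::{api_version, TRITONSERVER_API_VERSION_MAJOR, TRITONSERVER_API_VERSION_MINOR};
///
/// let (major, minor) = api_version().expect("failed to query the Triton library version");
/// // The version the crate was built against must match what the shared library reports.
/// assert_eq!(major, TRITONSERVER_API_VERSION_MAJOR);
/// assert_eq!(minor, TRITONSERVER_API_VERSION_MINOR);
/// ```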
pub fn api_version() -> Result<(u32, u32), Error> {
    let mut major: u32 = 0;
    let mut minor: u32 = 0;

    triton_call!(
        sys::TRITONSERVER_ApiVersion(&mut major as *mut _, &mut minor as *mut _),
        (major, minor)
    )
}

pub(crate) fn to_cstring<S: AsRef<str>>(value: S) -> Result<CString, Error> {
    CString::new(value.as_ref().as_bytes())
        .map_err(|err| Error::new(ErrorCode::InvalidArg, format!("{}", err)))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn api() {
        let (major, minor) = api_version().unwrap();

        assert_eq!(major, TRITONSERVER_API_VERSION_MAJOR);
        assert_eq!(minor, TRITONSERVER_API_VERSION_MINOR);
    }
}