tritonserver_rs/lib.rs
//! # **Perform easy and efficient ML model inference**
//!
//! This crate is designed to run **any** Machine Learning model on **any** architecture with ease and efficiency.
//! It leverages the [Triton Inference Server](https://github.com/triton-inference-server/server)
//! (specifically the [Triton C library](https://github.com/triton-inference-server/core)) and provides a similar API with comparable advantages.
//! However, **Tritonserver-rs** allows you to build the inference server locally, offering significant performance benefits.
//! Check the [benchmark](https://github.com/3xMike/tritonserver-rs/blob/main/BENCH.md) for more details.
//!
//! ---
//!
//! # Usage
//!
//! Run inference in three simple steps:
//!
//! ## **Step 1. Prepare the model repository**
//!
//! Organize your model files in the following structure:
//!
//! ```text
//! models/
//! ├── yolov8/
//! |   ├── config.pbtxt
//! |   ├── 1/
//! |   │   └── model.onnx
//! |   ├── 2/
//! |   │   └── model.onnx
//! |   └── `<other versions of yolov8>`/
//! └── `<other models>`/
//! ```
//!
//! **Rules**:
//! - All models must be stored in the same root directory (`models/` in this example).
//! - Each model resides in its own folder containing:
//!   - A `config.pbtxt` configuration file (see the example below).
//!   - One or more subdirectories, each representing a version of the model and containing the model file (e.g., `model.onnx`).
//!
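//! A minimal `config.pbtxt` for the `yolov8` model above might look like the sketch below. The tensor names, data types, and dimensions are illustrative and must match your actual model; see the [model configuration guide](https://github.com/3xMike/tritonserver-rs/blob/main/MODEL_CONFIGURATION.md) for the full format.
//!
//! ```text
//! name: "yolov8"
//! backend: "onnxruntime"
//! max_batch_size: 1
//! input [
//!   {
//!     name: "IMAGE"
//!     data_type: TYPE_UINT8
//!     dims: [ -1, -1, 3 ]
//!   }
//! ]
//! output [
//!   {
//!     name: "OUTPUT"
//!     data_type: TYPE_FP32
//!     dims: [ -1, -1 ]
//!   }
//! ]
//! ```
//!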
//! ---
//!
//! ## **Step 2. Write the code**
//!
//! Add **Tritonserver-rs** to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! tritonserver-rs = "0.1"
//! ```
//!
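//! If you plan to handle model inputs and outputs directly in GPU memory, enable the crate's `gpu` feature, which gates the CUDA-related items (the `context` module, `get_context`, and `init_cuda`). For example:
//!
//! ```toml
//! [dependencies]
//! tritonserver-rs = { version = "0.1", features = ["gpu"] }
//! ```
//!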
//! Then write your application code:
//!
//! ```no_run
//! # async fn run() -> Result<(), Box<dyn std::error::Error>> {
//! use tritonserver_rs::{Buffer, options::Options, Server};
//! use std::time::Duration;
//!
//! // Configure server options.
//! let mut opts = Options::new("models/")?;
//!
//! opts.exit_timeout(Duration::from_secs(5))?
//!     .backend_directory("/opt/tritonserver/backends")?;
//!
//! // Create the server.
//! let server = Server::new(opts).await?;
//!
//! // Input data.
//! let image = image::open("/data/cats.jpg")?;
//! let image = image.as_flat_samples_u8();
//!
//! // Create a request (specify the model name and version).
//! let mut request = server.create_request("yolov8", 2)?;
//!
//! // Add input data and an allocator.
//! request
//!     .add_default_allocator()
//!     .add_input("IMAGE", Buffer::from(image))?;
//!
//! // Run inference.
//! let fut = request.infer_async()?;
//!
//! // Obtain results.
//! let response = fut.await?;
//! # Ok(())
//! # }
//! ```
//!
//! ---
//!
//! ## **Step 3. Deploy**
//!
//! Here is an example of how to deploy using `docker-compose.yml`:
//!
//! ```yml
//! my_app:
//!   image: {DEV_IMAGE}
//!   volumes:
//!     - ./Cargo.toml:/project/Cargo.toml
//!     - ./src:/project/src
//!     - ../models:/models
//!     - ../cats.jpg:/data/cats.jpg
//!   entrypoint: ["cargo", "run", "--manifest-path=/project/Cargo.toml"]
//! ```
//!
//! We recommend using `Dockerfile.dev` as `{DEV_IMAGE}`. For more details on suitable images and deployment instructions, see [DEPLOY.md](https://github.com/3xMike/tritonserver-rs/blob/main/DEPLOY.md).
//!
//! ---
//!
//! # **More Information**
//!
//! For further details, check out the following resources (in the [GitHub repo](https://github.com/3xMike/tritonserver-rs/blob/main)):
//! - [Examples](https://github.com/3xMike/tritonserver-rs/blob/main/examples): Learn how to run various ML models using **Tritonserver-rs**, configure inference, prepare models, and deploy.
//! - [Model configuration guide](https://github.com/3xMike/tritonserver-rs/blob/main/MODEL_CONFIGURATION.md).
//! - [Build and deployment instructions](https://github.com/3xMike/tritonserver-rs/blob/main/DEPLOY.md).
//! - [Benchmark results](https://github.com/3xMike/tritonserver-rs/blob/main/BENCH.md).
//! - [Triton Inference Server guides](https://github.com/triton-inference-server/server/tree/main/docs/README.md).
//!
//! ---
//!
//! # **Advantages of the Crate**
//!
//! - **Versatility**: Extensive configuration options for models and servers.
//! - **High performance**: Optimized for maximum efficiency.
//! - **Broad backend support**: Run PyTorch, ONNX, TensorFlow, TensorRT, OpenVINO, model pipelines, and custom backends out of the box.
//! - **Compatibility**: Supports most GPUs and architectures.
//! - **Multi-model handling**: Serve multiple models simultaneously.
//! - **Prometheus integration**: Built-in support for monitoring.
//! - **CUDA-optimized**: Handle model inputs and outputs directly in GPU memory.
//! - **Dynamic server management**: Advanced runtime control features.
//! - **Rust-based**: Enjoy the safety, speed, and concurrency benefits of Rust.
//!
//! # Tritonserver C-lib API version
//!
//! `1.33` (minimal `TRITON_CONTAINER_VERSION=23.07`).
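//!
//! To check which API version the linked Triton library actually exposes at runtime, use [`api_version`]. A minimal sketch (the exact compatibility policy is up to you; a common rule is an equal major version and at least the same minor version):
//!
//! ```no_run
//! use tritonserver_rs::{api_version, TRITONSERVER_API_VERSION_MAJOR, TRITONSERVER_API_VERSION_MINOR};
//!
//! let (major, minor) = api_version().expect("failed to query the Triton API version");
//! assert_eq!(major, TRITONSERVER_API_VERSION_MAJOR);
//! assert!(minor >= TRITONSERVER_API_VERSION_MINOR);
//! ```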

#![allow(clippy::bad_bit_mask)]

/// Macros to run CUDA operations in context.
#[macro_use]
pub mod macros;

pub(crate) mod allocator;
#[cfg(feature = "gpu")]
/// CUDA context for managing device execution.
pub mod context;
/// Error types for Tritonserver-rs.
pub mod error;
/// Memory management utilities for model inference.
pub mod memory;
/// Metadata message serialization/deserialization.
pub mod message;
/// Performance metrics collection and reporting.
pub mod metrics;
/// Configuration options for the Tritonserver-rs server.
pub mod options;
/// Parameters for inference requests and the server.
pub mod parameter;
/// Request builder and utilities for Triton server inference.
pub mod request;
/// Response handling and parsing from the Triton server.
pub mod response;
/// Server initialization and lifecycle management.
pub mod server;
pub(crate) mod sys {
    #![allow(
        non_camel_case_types,
        non_upper_case_globals,
        non_snake_case,
        dead_code,
        unused_imports,
        rustdoc::invalid_html_tags
    )]
    include!(concat!(env!("OUT_DIR"), "/tritonserver.rs"));
}
/// Inference request tracing.
pub mod trace;

pub use crate::{
    error::{Error, ErrorCode},
    memory::{Buffer, MemoryType},
    request::{Allocator, Request},
    response::Response,
    server::Server,
    sys::{TRITONSERVER_API_VERSION_MAJOR, TRITONSERVER_API_VERSION_MINOR},
};
#[cfg(feature = "gpu")]
pub use context::{get_context, init_cuda};

use std::{
    ffi::{CStr, CString},
    os::{raw::c_char, unix::ffi::OsStrExt as _},
    path::Path,
};

/// Get the TRITONSERVER API version supported by the Triton shared library.
///
/// This value can be compared against the `TRITONSERVER_API_VERSION_MAJOR` and `TRITONSERVER_API_VERSION_MINOR`
/// constants used to build the client to ensure that the Triton shared library is compatible with the client.
pub fn api_version() -> Result<(u32, u32), Error> {
    let mut major: u32 = 0;
    let mut minor: u32 = 0;

    triton_call!(
        sys::TRITONSERVER_ApiVersion(&mut major as *mut _, &mut minor as *mut _),
        (major, minor)
    )
}

/// Converts a Rust string into a `CString`, returning an `InvalidArg` error
/// if the string contains interior NUL bytes.
pub(crate) fn to_cstring<S: AsRef<str>>(value: S) -> Result<CString, Error> {
    CString::new(value.as_ref().as_bytes())
        .map_err(|err| Error::new(ErrorCode::InvalidArg, err.to_string()))
}

/// Canonicalizes a filesystem path and converts it into a `CString`,
/// returning an `InvalidArg` error if either step fails.
pub(crate) fn path_to_cstring<P: AsRef<Path>>(value: P) -> Result<CString, Error> {
    value
        .as_ref()
        .canonicalize()
        .map_err(|err| Error::new(ErrorCode::InvalidArg, err.to_string()))
        .and_then(|path| {
            CString::new(path.as_os_str().as_bytes())
                .map_err(|err| Error::new(ErrorCode::InvalidArg, err.to_string()))
        })
}

/// Copies a NUL-terminated C string into an owned `String`,
/// substituting a placeholder if the bytes are not valid UTF-8.
///
/// Panics if `value` is null.
pub(crate) fn from_char_array(value: *const c_char) -> String {
    assert!(!value.is_null());
    unsafe { CStr::from_ptr(value) }
        .to_str()
        .unwrap_or(error::CSTR_CONVERT_ERROR_PLUG)
        .to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn api() {
        let (major, minor) = api_version().unwrap();

        assert_eq!(major, TRITONSERVER_API_VERSION_MAJOR);
        assert_eq!(minor, TRITONSERVER_API_VERSION_MINOR);
    }
}