pub struct ExecutionAccelerators {
pub gpu_execution_accelerator: Vec<Accelerator>,
pub cpu_execution_accelerator: Vec<Accelerator>,
}
Specify the preferred execution accelerators to be used to execute the model. Currently only recognized by the ONNX Runtime backend and the TensorFlow backend.

For the ONNX Runtime backend, the model is deployed with the execution accelerators in priority order. The priority is determined by the order in which the accelerators are set, i.e. the provider at the front has the highest priority. Overall, the priority is:

    <gpu_execution_accelerator> (if the instance is on GPU)
    CUDA Execution Provider (if the instance is on GPU)
    <cpu_execution_accelerator>
    Default CPU Execution Provider
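For example, an ONNX Runtime model instance could prefer TensorRT on GPU and OpenVINO on CPU. A minimal sketch of building the message with the prost-generated types, assuming the companion Accelerator struct exposes name and parameters fields generated from the proto definition:

use std::collections::HashMap;

let accels = ExecutionAccelerators {
    // Tried first when the instance runs on GPU; the CUDA Execution
    // Provider is the implicit fallback after it.
    gpu_execution_accelerator: vec![Accelerator {
        name: "tensorrt".to_string(),
        parameters: HashMap::new(), // no parameters required for ONNX Runtime
    }],
    // Tried before the default CPU Execution Provider.
    cpu_execution_accelerator: vec![Accelerator {
        name: "openvino".to_string(),
        parameters: HashMap::new(),
    }],
};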
Fields
gpu_execution_accelerator: Vec<Accelerator>
The preferred execution provider to be used if the model instance is deployed on GPU.

For the ONNX Runtime backend, the only possible value is "tensorrt" as the name, and no parameters are required.

For the TensorFlow backend, possible values are "tensorrt", "auto_mixed_precision", and "gpu_io".

For "tensorrt", the following parameters can be specified:

    "precision_mode": The precision used for optimization. Allowed values are "FP32" and "FP16". Default value is "FP32".
    "max_cached_engines": The maximum number of cached TensorRT engines in dynamic TensorRT ops. Default value is 100.
    "minimum_segment_size": The smallest model subgraph that will be considered for optimization by TensorRT. Default value is 3.
    "max_workspace_size_bytes": The maximum GPU memory the model can use temporarily during execution. Default value is 1GB.

For "auto_mixed_precision", no parameters are required. If set, the model will try to use FP16 for better performance. This optimization cannot be combined with "tensorrt".

For "gpu_io", no parameters are required. If set, the model will be executed using the TensorFlow Callable API to place input and output tensors in GPU memory if possible, which can reduce data-transfer overhead if the model is used in an ensemble. However, the Callable object is created at model creation and requests all outputs on every model execution, which may impact performance if a request does not require all outputs. This optimization only takes effect if the model instance is created with KIND_GPU.
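As a sketch of how the TensorFlow "tensorrt" options above are passed, each parameter is a plain string key/value pair in the accelerator's parameters map (the Accelerator field names are assumed from the prost-generated struct):

use std::collections::HashMap;

let tensorrt = Accelerator {
    name: "tensorrt".to_string(),
    parameters: HashMap::from([
        ("precision_mode".to_string(), "FP16".to_string()),
        ("max_cached_engines".to_string(), "100".to_string()),
        ("minimum_segment_size".to_string(), "3".to_string()),
        // the documented 1GB default, expressed in bytes as a string
        ("max_workspace_size_bytes".to_string(), "1073741824".to_string()),
    ]),
};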
cpu_execution_accelerator: Vec<Accelerator>
The preferred execution provider to be used if the model instance is deployed on CPU.

For the ONNX Runtime backend, the only possible value is "openvino" as the name, and no parameters are required.
Trait Implementations

impl Clone for ExecutionAccelerators
    fn clone(&self) -> ExecutionAccelerators
    fn clone_from(&mut self, source: &Self)

impl Debug for ExecutionAccelerators

impl Default for ExecutionAccelerators

impl Message for ExecutionAccelerators
    fn encoded_len(&self) -> usize
    fn encode<B>(&self, buf: &mut B) -> Result<(), EncodeError>
    fn encode_to_vec(&self) -> Vec<u8> where Self: Sized
    fn encode_length_delimited<B>(&self, buf: &mut B) -> Result<(), EncodeError>
    fn encode_length_delimited_to_vec(&self) -> Vec<u8> where Self: Sized
    fn decode<B>(buf: B) -> Result<Self, DecodeError>
    fn decode_length_delimited<B>(buf: B) -> Result<Self, DecodeError>
    fn merge<B>(&mut self, buf: B) -> Result<(), DecodeError>
    fn merge_length_delimited<B>(&mut self, buf: B) -> Result<(), DecodeError>
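Because ExecutionAccelerators implements prost's Message trait, it can be round-tripped through the protobuf wire format. A minimal sketch:

use prost::Message;

let accels = ExecutionAccelerators::default();
// Encode into a freshly allocated buffer ...
let bytes = accels.encode_to_vec();
// ... and decode it back; `decode` accepts any `bytes::Buf`, and `&[u8]`
// implements it.
let decoded = ExecutionAccelerators::decode(bytes.as_slice()).expect("valid wire format");
assert_eq!(accels, decoded);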
impl PartialEq for ExecutionAccelerators
impl StructuralPartialEq for ExecutionAccelerators
Auto Trait Implementations
impl Freeze for ExecutionAccelerators
impl RefUnwindSafe for ExecutionAccelerators
impl Send for ExecutionAccelerators
impl Sync for ExecutionAccelerators
impl Unpin for ExecutionAccelerators
impl UnwindSafe for ExecutionAccelerators
Blanket Implementations
impl<T> BorrowMut<T> for T where T: ?Sized
    fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone
impl<T> Instrument for T
    fn instrument(self, span: Span) -> Instrumented<Self>
    fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoRequest<T> for T
    fn into_request(self) -> Request<T>
        Wrap the input message T in a tonic::Request.
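Through this blanket impl, the message can be handed to a tonic-generated client method that takes impl IntoRequest<ExecutionAccelerators>, or wrapped explicitly. A minimal sketch:

use tonic::IntoRequest;

let accels = ExecutionAccelerators::default();
// Wraps the message in a tonic::Request with empty metadata.
let request: tonic::Request<ExecutionAccelerators> = accels.into_request();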