
IQuantizeLayer

Struct IQuantizeLayer 

Source
pub struct IQuantizeLayer { /* private fields */ }

A Quantize layer in a network definition.

This layer accepts a floating-point data input tensor and uses the scale and zeroPt inputs to quantize the data according to:

    output = clamp(round(input / scale) + zeroPt)

Rounding type is round-to-nearest, ties-to-even (https://en.wikipedia.org/wiki/Rounding#Round_half_to_even). Clamping range according to data type:

- FP8: [-448, 448]
- INT4: [-8, 7]
- INT8: [-128, 127]

The first input (index 0) is the tensor to be quantized. The second (index 1) and third (index 2) are the scale and zero point, respectively. scale and zeroPt must have identical dimensions and rank less than or equal to 2.

The zeroPt tensor is optional; if not set, it is assumed to be zero. Its data type must match the output data type. zeroPt must contain only zero-valued coefficients, because only symmetric quantization is supported. The scale value must be a scalar for per-tensor quantization, a 1D tensor for per-channel quantization, or have the same rank as the input tensor for block quantization. All scale coefficients must have strictly positive values. The size of the 1D scale tensor must match the size of the quantization axis. For block quantization, the shape of the scale tensor must match the shape of the input, except for the blocking dimension (the last or second-to-last dimension). The size of zeroPt must match the size of scale.

The subgraph which terminates with the zeroPt tensor must be a build-time constant containing only zeros. The output type, if constrained, must be constrained to DataType::kINT8, DataType::kFP8, DataType::kINT4, or DataType::kFP4. The input type, if constrained, must be constrained to DataType::kFLOAT, DataType::kHALF, or DataType::kBF16. The output size is the same as the input size. The quantization axis is in reference to the input tensor's dimensions.
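As a concrete illustration of the formula above, here is a minimal sketch of per-tensor quantization for INT8 output (clamp range [-128, 127]) in plain Rust. The function name and the plain-slice representation are assumptions for illustration only; they are not part of the TensorRT or binding API.

```rust
// Per-tensor quantization: output = clamp(round(input / scale) + zeroPt),
// using round-to-nearest, ties-to-even, as the layer specifies.
fn quantize_int8_per_tensor(input: &[f32], scale: f32, zero_pt: i32) -> Vec<i8> {
    input
        .iter()
        .map(|&x| {
            let q = (x / scale).round_ties_even() as i32 + zero_pt;
            q.clamp(-128, 127) as i8 // INT8 clamping range
        })
        .collect()
}

fn main() {
    // zeroPt is zero: only symmetric quantization is supported.
    let out = quantize_int8_per_tensor(&[0.25, 0.75, 64.0, -100.0], 0.5, 0);
    println!("{:?}", out); // [0, 2, 127, -128]
}
```

Note how 0.25 / 0.5 = 0.5 rounds to 0 (ties-to-even) while 0.75 / 0.5 = 1.5 rounds to 2, and how 64.0 / 0.5 = 128 saturates to 127.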
IQuantizeLayer supports DataType::kFLOAT, DataType::kHALF, or DataType::kBF16 precision and will default to DataType::kFLOAT precision during instantiation. For strongly typed networks, if the scale data type is DataType::kHALF or DataType::kBF16, it must match the input data type. For MXFP8 quantization, the scale data type must be DataType::kE8M0.

IQuantizeLayer supports DataType::kINT8, DataType::kFP8, DataType::kINT4, or DataType::kFP4 output.

As an example of the operation of this layer, imagine a 4D NCHW activation input which can be quantized using a single scale coefficient (referred to as per-tensor quantization):

    for each n in N:
        for each c in C:
            for each h in H:
                for each w in W:
                    output[n,c,h,w] = clamp(round(input[n,c,h,w] / scale) + zeroPt)

Per-channel quantization is supported only for weight inputs; activations cannot be quantized per-channel. As an example of per-channel operation, imagine a 4D KCRS weights input with K (dimension 0) as the quantization axis. The scale is an array of coefficients and must have the same size as the quantization axis:

    for each k in K:
        for each c in C:
            for each r in R:
                for each s in S:
                    output[k,c,r,s] = clamp(round(input[k,c,r,s] / scale[k]) + zeroPt[k])

Block quantization is supported for output types DataType::kFP4, DataType::kFP8, and DataType::kINT4. As an example of blocked operation, imagine a 2D RS input with R (dimension 0) as the blocking axis and B as the block size. The scale is a 2D array of coefficients with dimensions (R//B, S):

    for each r in R:
        for each s in S:
            output[r,s] = clamp(round(input[r,s] / scale[r//B, s]) + zeroPt[r//B, s])

Note: only symmetric quantization is supported.

Note: currently the only allowed build-time constant zeroPt subgraphs are:
1. Constant -> Quantize
2. Constant -> Cast -> Quantize

Note: the input tensor for this layer must not be a scalar.
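The blocked loop above can be sketched directly in plain Rust. This models only the math (shown here with an INT8 clamp range and zeroPt of zero, since only symmetric quantization is supported); the function name and nested-Vec layout are assumptions for the sketch, not TensorRT types.

```rust
// Block quantization of a 2D R x S input with R as the blocking axis and
// block size b. scale has shape (R/b, S): one scale row per block of rows.
fn quantize_int8_blocked(input: &[Vec<f32>], scale: &[Vec<f32>], b: usize) -> Vec<Vec<i8>> {
    input
        .iter()
        .enumerate()
        .map(|(r, row)| {
            row.iter()
                .enumerate()
                .map(|(s, &x)| {
                    // output[r,s] = clamp(round(input[r,s] / scale[r/b, s]))
                    let q = (x / scale[r / b][s]).round_ties_even() as i32;
                    q.clamp(-128, 127) as i8
                })
                .collect()
        })
        .collect()
}

fn main() {
    // R = 4, S = 2, block size B = 2, so scale has shape (2, 2):
    // scale[0] covers rows 0-1 and scale[1] covers rows 2-3.
    let input = [
        vec![1.0, 2.0],
        vec![3.0, 4.0],
        vec![10.0, 20.0],
        vec![30.0, 40.0],
    ];
    let scale = [vec![0.5, 0.5], vec![10.0, 10.0]];
    println!("{:?}", quantize_int8_blocked(&input, &scale, 2));
    // [[2, 4], [6, 8], [1, 2], [3, 4]]
}
```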
Warning: do not inherit from this class, as doing so will break forward-compatibility of the API and ABI.

Implementations§

Source§

impl IQuantizeLayer

Source

pub fn getAxis(self: &IQuantizeLayer) -> i32

Get the quantization axis.

Returns the axis parameter set by setAxis(). The return value is the index of the quantization axis in the input tensor's dimensions; a value of -1 indicates per-tensor quantization. The default value is -1.

Source

pub fn setAxis(self: Pin<&mut IQuantizeLayer>, axis: i32)

Set the quantization axis.

Sets the index of the quantization axis (with reference to the input tensor's dimensions). The axis must be a valid axis if the scale tensor has more than one coefficient. The axis value is used only for per-axis (per-channel) quantization.
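The axis semantics can be illustrated with a small sketch: with axis = 0 on a KCRS-style weight tensor, scale[k] applies to every element of channel k. The function name and flat-index layout below are assumptions for illustration, not part of the binding.

```rust
// Per-channel quantization with the quantization axis at dimension 0 (K).
// channel_stride is the number of elements per channel (C * R * S), so
// flat index i belongs to channel k = i / channel_stride.
fn quantize_int8_per_channel(
    input: &[f32],         // flattened K x C x R x S weights
    scale: &[f32],         // one coefficient per k, length K
    channel_stride: usize, // C * R * S
) -> Vec<i8> {
    input
        .iter()
        .enumerate()
        .map(|(i, &x)| {
            let k = i / channel_stride; // index along the quantization axis
            let q = (x / scale[k]).round_ties_even() as i32;
            q.clamp(-128, 127) as i8
        })
        .collect()
}

fn main() {
    // K = 2 channels of 2 elements each: scale[0] = 0.5, scale[1] = 10.0.
    let out = quantize_int8_per_channel(&[1.0, 2.0, 10.0, 20.0], &[0.5, 10.0], 2);
    println!("{:?}", out); // [2, 4, 1, 2]
}
```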

Source

pub fn setBlockShape(self: Pin<&mut IQuantizeLayer>, blockShape: &Dims64) -> bool

Set the shape of the quantization block.

Allowed values are positive values and -1, which denotes a fully blocked dimension. Returns true if the block shape was set successfully, and false if the block shape is invalid. The default value is an empty Dims.

See also getBlockShape().

Source

pub fn getBlockShape(self: &IQuantizeLayer) -> Dims64

Get the shape of the quantization block.

The default value is an empty Dims.

See also setBlockShape().

Source

pub fn setToType(self: Pin<&mut IQuantizeLayer>, toType: DataType)

Set the Quantize layer output type.

toType is the DataType of the output tensor. Valid values are DataType::kINT8, DataType::kFP8, DataType::kINT4, and DataType::kFP4. If the network is strongly typed, setToType must be used to set the output type, and use of setOutputType is an error. Otherwise, the types passed to setOutputType and setToType must be the same.

See also NetworkDefinitionCreationFlag::kSTRONGLY_TYPED.

Source

pub fn getToType(self: &IQuantizeLayer) -> DataType

Return the Quantize layer output type.

Returns the toType parameter set during layer creation or by setToType(). The default value is DataType::kINT8.

Trait Implementations§

Source§

impl AsRef<ILayer> for IQuantizeLayer

Source§

fn as_ref(self: &IQuantizeLayer) -> &ILayer

Converts this type into a shared reference of the (usually inferred) input type.
Source§

impl ExternType for IQuantizeLayer

Source§

type Id = (n, v, i, n, f, e, r, _1, (), I, Q, u, a, n, t, i, z, e, L, a, y, e, r)

A type-level representation of the type’s C++ namespace and type name. Read more
Source§

type Kind = Opaque

Source§

impl MakeCppStorage for IQuantizeLayer

Source§

unsafe fn allocate_uninitialized_cpp_storage() -> *mut IQuantizeLayer

Allocates heap space for this type in C++ and return a pointer to that space, but do not initialize that space (i.e. do not yet call a constructor). Read more
Source§

unsafe fn free_uninitialized_cpp_storage(arg0: *mut IQuantizeLayer)

Frees a C++ allocation which has not yet had a constructor called. Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.