Struct TensorCore

Source

pub struct TensorCore {
    pub dims: (usize, usize, usize),
    pub threads: usize,
    pub elements_per_thread: (usize, usize, usize),
    pub dtype_in: DType,
    pub dtype_out: DType,
    pub opts: SmallVec<[TcOpt; 8]>,
    pub swizzle: ((SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>), (SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>)),
    pub pack_a: bool,
    pub tile_grid: (usize, usize),
}

Expand description

Tensor core configuration for hardware-accelerated matrix multiplication.

Describes a specific matrix multiplication unit with fixed dimensions and data types. Modeled on NVIDIA’s WMMA (Warp Matrix Multiply-Accumulate) API and the equivalent interfaces of similar accelerators.

§Matrix Dimensions

Tensor cores perform: C[M,N] += A[M,K] × B[K,N]. Note that the dims tuple is stored in (N, M, K) order:

dims.0 (N): Number of output columns
dims.1 (M): Number of output rows
dims.2 (K): Reduction dimension size

§Example

NVIDIA Tensor Core 16x16x16:

Processes 16×16 output tile
Accumulates across 16 K elements
Uses 32 threads (warp size)
Each thread handles multiple elements via opts

Fields§

§dims: (usize, usize, usize)

Matrix dimensions (N, M, K).

§threads: usize

Number of threads required (typically the warp size: 32 for CUDA, 64 for AMD).

§elements_per_thread: (usize, usize, usize)

Elements per thread in each dimension (N, M, K).

Describes how the matrix is distributed across threads. Example: (2, 2, 4) means each thread handles 2×2 output elements and processes 4 K elements.

§dtype_in: DType

Input matrix data type (A and B matrices).

§dtype_out: DType

Output/accumulator data type (C matrix).

§opts: SmallVec<[TcOpt; 8]>

Optimization sequence for tensor core application.

A sequence of operations to transform ranges. Each operation splits a dimension (N, M, or K) and assigns it to a new axis type.

Example: [Upcast(0), Local(0), Local(0), Local(1), Local(1), Local(1), Upcast(1)]

Upcast N once
Local split N twice
Local split M three times
Upcast M once

Uses SmallVec to avoid heap allocation for typical tensor cores (≤8 ops).

§swizzle: ((SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>), (SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>, SmallVec<[SwizzleAxis; 8]>))

Swizzle patterns for input permutation.

Describes how to permute input matrices to match hardware layout. Format: ((A_local, A_upcast, A_reduce), (B_local, B_upcast, B_reduce))

Each tuple contains axis references that describe the permutation pattern for optimal memory access. The first tuple is for matrix A, second for B.

Uses SmallVec to avoid heap allocation for typical swizzles (≤8 axes per vec).

§pack_a: bool

Pre-pack operand A into a contiguous scratch buffer before the reduction loop. Beneficial when the A operand has non-unit stride access (e.g., AMX row-major matmul).

§tile_grid: (usize, usize)

Tile grid for multi-FMA batching (tile_y_count, tile_x_count).

When the grid is larger than (1, 1), the codegen emits load-pair instructions and multiple FMAs per K iteration to compute a grid of output tiles simultaneously. The default is (1, 1), for single-tile operation.

Struct TensorCore Copy item path

§Matrix Dimensions

§Example

Fields§

Implementations§

impl TensorCore

pub fn get_reduce_axes(&self) -> Vec<(usize, usize)>

pub fn upcast_axes(&self) -> (Vec<usize>, Vec<usize>, Vec<usize>)

pub fn sm75_tensor_cores() -> Vec<TensorCore>

pub fn sm80_tensor_cores(allow_tf32: bool) -> Vec<TensorCore>

pub fn sm89_tensor_cores(allow_tf32: bool) -> Vec<TensorCore>

pub fn rdna3_tensor_cores() -> Vec<TensorCore>

pub fn rdna4_tensor_cores() -> Vec<TensorCore>

pub fn cdna3_tensor_cores() -> Vec<TensorCore>

pub fn cdna4_tensor_cores() -> Vec<TensorCore>

pub fn metal_tensor_cores() -> Vec<TensorCore>

pub fn amx_tensor_cores() -> Vec<TensorCore>

pub fn intel_tensor_cores() -> Vec<TensorCore>

Trait Implementations§

impl Clone for TensorCore

fn clone(&self) -> TensorCore

fn clone_from(&mut self, source: &Self)

impl Debug for TensorCore

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Auto Trait Implementations§

impl Freeze for TensorCore

impl RefUnwindSafe for TensorCore

impl Send for TensorCore

impl Sync for TensorCore

impl Unpin for TensorCore

impl UnsafeUnpin for TensorCore

impl UnwindSafe for TensorCore

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

Struct TensorCore

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,