pub struct CudaBackendImpl { /* private fields */ }

Expand description
CUDA implementation of the GpuBackend trait.
Holds one or more GpuDevice handles (currently device 0 only) and
delegates every trait method to the corresponding function in
crate::kernels, crate::blas, or crate::transfer.
Implementations§
Source§impl CudaBackendImpl
impl CudaBackendImpl
Source§pub fn new() -> FerrotorchResult<Self>
pub fn new() -> FerrotorchResult<Self>
Create a new CUDA backend, initializing device 0.
§Errors
Returns FerrotorchError::InvalidArgument if CUDA initialization fails
(e.g. no GPU available, driver not loaded).
Source§pub fn default_device(&self) -> FerrotorchResult<&Arc<GpuDevice>>
pub fn default_device(&self) -> FerrotorchResult<&Arc<GpuDevice>>
Get the device for ordinal 0 (the default device).
Trait Implementations§
Source§impl GpuBackend for CudaBackendImpl
impl GpuBackend for CudaBackendImpl
Source§fn as_any(&self) -> &dyn Any
fn as_any(&self) -> &dyn Any
Downcast to
&dyn Any for backend-specific access (e.g., getting the
underlying GpuDevice for CUDA graph capture).

fn cpu_to_gpu( &self, data: &[u8], elem_size: usize, device: usize, ) -> FerrotorchResult<GpuBufferHandle>
Source§fn cpu_to_gpu_pinned(
&self,
data: &[u8],
elem_size: usize,
device: usize,
) -> FerrotorchResult<GpuBufferHandle>
fn cpu_to_gpu_pinned( &self, data: &[u8], elem_size: usize, device: usize, ) -> FerrotorchResult<GpuBufferHandle>
Copy CPU data to GPU via pinned (page-locked) host memory. Read more
fn gpu_to_cpu(&self, handle: &GpuBufferHandle) -> FerrotorchResult<Vec<u8>>
fn clone_buffer( &self, handle: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn alloc_zeros( &self, len: usize, elem_size: usize, device: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn add_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn sub_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn mul_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn neg_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn relu_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn div_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn exp_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn log_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn sqrt_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn pow_f32( &self, a: &GpuBufferHandle, exponent: f32, ) -> FerrotorchResult<GpuBufferHandle>
fn abs_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn sigmoid_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn tanh_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
Source§fn fused_adam_f32(
&self,
param: &mut GpuBufferHandle,
grad: &GpuBufferHandle,
exp_avg: &mut GpuBufferHandle,
exp_avg_sq: &mut GpuBufferHandle,
beta1: f32,
beta2: f32,
lr: f32,
eps: f32,
bc1: f32,
bc2: f32,
weight_decay: f32,
) -> FerrotorchResult<()>
fn fused_adam_f32( &self, param: &mut GpuBufferHandle, grad: &GpuBufferHandle, exp_avg: &mut GpuBufferHandle, exp_avg_sq: &mut GpuBufferHandle, beta1: f32, beta2: f32, lr: f32, eps: f32, bc1: f32, bc2: f32, weight_decay: f32, ) -> FerrotorchResult<()>
Fused Adam optimizer step: updates param, exp_avg, and exp_avg_sq
in a single kernel launch. Read more
Source§fn maxpool2d_f32(
&self,
input: &GpuBufferHandle,
batch: usize,
channels: usize,
h_in: usize,
w_in: usize,
kh: usize,
kw: usize,
sh: usize,
sw: usize,
ph: usize,
pw: usize,
) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
fn maxpool2d_f32( &self, input: &GpuBufferHandle, batch: usize, channels: usize, h_in: usize, w_in: usize, kh: usize, kw: usize, sh: usize, sw: usize, ph: usize, pw: usize, ) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
GPU MaxPool2d forward.
Source§fn avgpool2d_f32(
&self,
input: &GpuBufferHandle,
batch: usize,
channels: usize,
h_in: usize,
w_in: usize,
kh: usize,
kw: usize,
sh: usize,
sw: usize,
ph: usize,
pw: usize,
) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
fn avgpool2d_f32( &self, input: &GpuBufferHandle, batch: usize, channels: usize, h_in: usize, w_in: usize, kh: usize, kw: usize, sh: usize, sw: usize, ph: usize, pw: usize, ) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
GPU AvgPool2d forward.
Source§fn conv2d_f32(
&self,
input: &GpuBufferHandle,
weight: &GpuBufferHandle,
bias: Option<&GpuBufferHandle>,
input_shape: [usize; 4],
weight_shape: [usize; 4],
stride: (usize, usize),
padding: (usize, usize),
) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
fn conv2d_f32( &self, input: &GpuBufferHandle, weight: &GpuBufferHandle, bias: Option<&GpuBufferHandle>, input_shape: [usize; 4], weight_shape: [usize; 4], stride: (usize, usize), padding: (usize, usize), ) -> FerrotorchResult<(GpuBufferHandle, [usize; 4])>
GPU Conv2d forward: im2col + GEMM + bias add, entirely on-device. Read more
Source§fn fused_gru_cell_f32(
&self,
input_gates: &GpuBufferHandle,
hidden_gates: &GpuBufferHandle,
bias_ih: &GpuBufferHandle,
bias_hh: &GpuBufferHandle,
hx: &GpuBufferHandle,
hidden_size: usize,
) -> FerrotorchResult<(GpuBufferHandle, GpuBufferHandle)>
fn fused_gru_cell_f32( &self, input_gates: &GpuBufferHandle, hidden_gates: &GpuBufferHandle, bias_ih: &GpuBufferHandle, bias_hh: &GpuBufferHandle, hx: &GpuBufferHandle, hidden_size: usize, ) -> FerrotorchResult<(GpuBufferHandle, GpuBufferHandle)>
Fused GRU cell forward: pointwise gate computation on pre-computed
gate matrices. Returns
(hy_handle, workspace_handle). Read more

Source§fn synchronize(&self, device: usize) -> FerrotorchResult<()>
fn synchronize(&self, device: usize) -> FerrotorchResult<()>
Synchronize the current stream on the given device, blocking until
all enqueued operations have completed.
Source§fn stream_count(&self, device: usize) -> usize
fn stream_count(&self, device: usize) -> usize
Return the number of streams in the pool for the given device.
fn matmul_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, m: usize, k: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn sum_f32( &self, a: &GpuBufferHandle, _len: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn matmul_f64( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, m: usize, k: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn broadcast_add_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, a_shape: &[usize], b_shape: &[usize], out_shape: &[usize], ) -> FerrotorchResult<GpuBufferHandle>
fn broadcast_sub_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, a_shape: &[usize], b_shape: &[usize], out_shape: &[usize], ) -> FerrotorchResult<GpuBufferHandle>
fn broadcast_mul_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, a_shape: &[usize], b_shape: &[usize], out_shape: &[usize], ) -> FerrotorchResult<GpuBufferHandle>
fn broadcast_div_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, a_shape: &[usize], b_shape: &[usize], out_shape: &[usize], ) -> FerrotorchResult<GpuBufferHandle>
fn softmax_f32( &self, a: &GpuBufferHandle, rows: usize, cols: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn dropout_f32( &self, a: &GpuBufferHandle, threshold: u32, scale: f32, seed: u32, ) -> FerrotorchResult<GpuBufferHandle>
Source§fn dropout_philox_f32(
&self,
a: &GpuBufferHandle,
threshold: u32,
scale: f32,
) -> FerrotorchResult<(GpuBufferHandle, GpuRngState)>
fn dropout_philox_f32( &self, a: &GpuBufferHandle, threshold: u32, scale: f32, ) -> FerrotorchResult<(GpuBufferHandle, GpuRngState)>
Dropout using the Philox CBRNG for deterministic, reproducible mask generation. Read more
fn transpose_2d_f32( &self, a: &GpuBufferHandle, m: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn permute_0213_f32( &self, a: &GpuBufferHandle, d0: usize, d1: usize, d2: usize, d3: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn bmm_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, batch: usize, m: usize, k: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
Source§fn bmm_f16_f32(
&self,
a: &GpuBufferHandle,
b: &GpuBufferHandle,
batch: usize,
m: usize,
k: usize,
n: usize,
) -> FerrotorchResult<GpuBufferHandle>
fn bmm_f16_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, batch: usize, m: usize, k: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
Batched matmul with f16 Tensor Core acceleration.
Takes f32 handles, converts to f16 internally, accumulates in f32.
fn gelu_f32(&self, a: &GpuBufferHandle) -> FerrotorchResult<GpuBufferHandle>
fn layernorm_f32( &self, input: &GpuBufferHandle, weight: &GpuBufferHandle, bias: &GpuBufferHandle, rows: usize, cols: usize, eps: f32, ) -> FerrotorchResult<GpuBufferHandle>
fn slice_write_f32( &self, src: &GpuBufferHandle, dst: &mut GpuBufferHandle, n_batch: usize, d: usize, max_len: usize, pos: usize, ) -> FerrotorchResult<()>
fn slice_read_f32( &self, src: &GpuBufferHandle, n_batch: usize, d: usize, len: usize, max_len: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn embed_lookup_f32( &self, idx: &GpuBufferHandle, weight: &GpuBufferHandle, d: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn embed_lookup_batch_f32( &self, indices: &GpuBufferHandle, weight: &GpuBufferHandle, n: usize, d: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn scatter_add_rows_f32( &self, grad_output: &GpuBufferHandle, indices: &GpuBufferHandle, num_embeddings: usize, d: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn scale_f32( &self, a: &GpuBufferHandle, scalar: f32, ) -> FerrotorchResult<GpuBufferHandle>
fn relu_backward_f32( &self, grad: &GpuBufferHandle, input: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn gelu_backward_f32( &self, grad: &GpuBufferHandle, input: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn index_select_1d_f32( &self, input: &GpuBufferHandle, indices: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn scatter_add_1d_f32( &self, grad_output: &GpuBufferHandle, indices: &GpuBufferHandle, input_len: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn masked_fill_f32( &self, input: &GpuBufferHandle, mask: &GpuBufferHandle, value: f32, ) -> FerrotorchResult<GpuBufferHandle>
fn masked_zero_f32( &self, grad: &GpuBufferHandle, mask: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn sigmoid_backward_f32( &self, grad: &GpuBufferHandle, output: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn tanh_backward_f32( &self, grad: &GpuBufferHandle, output: &GpuBufferHandle, ) -> FerrotorchResult<GpuBufferHandle>
fn softmax_backward_f32( &self, grad: &GpuBufferHandle, output: &GpuBufferHandle, cols: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn layernorm_backward_f32( &self, input: &GpuBufferHandle, grad_output: &GpuBufferHandle, weight: &GpuBufferHandle, rows: usize, cols: usize, eps: f32, ) -> FerrotorchResult<(GpuBufferHandle, GpuBufferHandle, GpuBufferHandle)>
fn sum_axis_f32( &self, a: &GpuBufferHandle, shape: &[usize], axis: usize, ) -> FerrotorchResult<GpuBufferHandle>
Source§fn matmul_f16_f32(
&self,
a: &GpuBufferHandle,
b: &GpuBufferHandle,
m: usize,
k: usize,
n: usize,
) -> FerrotorchResult<GpuBufferHandle>
fn matmul_f16_f32( &self, a: &GpuBufferHandle, b: &GpuBufferHandle, m: usize, k: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
Mixed-precision matmul: cast f32 inputs to f16, multiply, accumulate
back to f32. Used by autocast when the category is
ReducedPrecision. Read more

Source§fn save_rng_state(&self, device: usize) -> FerrotorchResult<GpuRngState>
fn save_rng_state(&self, device: usize) -> FerrotorchResult<GpuRngState>
Save the current GPU RNG state for a device. Used by checkpoint to
ensure dropout masks are identical on recomputation.
Source§fn restore_rng_state(&self, state: GpuRngState) -> FerrotorchResult<()>
fn restore_rng_state(&self, state: GpuRngState) -> FerrotorchResult<()>
Restore a previously saved GPU RNG state for a device.
fn strided_split_f32( &self, input: &GpuBufferHandle, total_along_axis: usize, split_offset: usize, split_size: usize, inner_size: usize, n: usize, ) -> FerrotorchResult<GpuBufferHandle>
fn strided_cat_f32( &self, input: &GpuBufferHandle, output: &mut GpuBufferHandle, total_along_axis: usize, cat_offset: usize, part_size: usize, inner_size: usize, n: usize, ) -> FerrotorchResult<()>
fn add_f64( &self, _a: &GpuBufferHandle, _b: &GpuBufferHandle, ) -> Result<GpuBufferHandle, FerrotorchError>
fn sub_f64( &self, _a: &GpuBufferHandle, _b: &GpuBufferHandle, ) -> Result<GpuBufferHandle, FerrotorchError>
fn mul_f64( &self, _a: &GpuBufferHandle, _b: &GpuBufferHandle, ) -> Result<GpuBufferHandle, FerrotorchError>
fn neg_f64( &self, _a: &GpuBufferHandle, ) -> Result<GpuBufferHandle, FerrotorchError>
fn relu_f64( &self, _a: &GpuBufferHandle, ) -> Result<GpuBufferHandle, FerrotorchError>
fn sum_f64( &self, _a: &GpuBufferHandle, _numel: usize, ) -> Result<GpuBufferHandle, FerrotorchError>
Source§fn has_inf_nan_f32(&self, a: &GpuBufferHandle) -> Result<bool, FerrotorchError>
fn has_inf_nan_f32(&self, a: &GpuBufferHandle) -> Result<bool, FerrotorchError>
Check if a GPU buffer contains any inf or NaN values.
fn svd_f32( &self, _a: &GpuBufferHandle, _m: usize, _n: usize, ) -> Result<(GpuBufferHandle, GpuBufferHandle, GpuBufferHandle), FerrotorchError>
fn cholesky_f32( &self, _a: &GpuBufferHandle, _n: usize, ) -> Result<GpuBufferHandle, FerrotorchError>
fn solve_f32( &self, _a: &GpuBufferHandle, _b: &GpuBufferHandle, _n: usize, _nrhs: usize, ) -> Result<GpuBufferHandle, FerrotorchError>
fn qr_f32( &self, _a: &GpuBufferHandle, _m: usize, _n: usize, ) -> Result<(GpuBufferHandle, GpuBufferHandle), FerrotorchError>
Auto Trait Implementations§
impl Freeze for CudaBackendImpl
impl RefUnwindSafe for CudaBackendImpl
impl Send for CudaBackendImpl
impl Sync for CudaBackendImpl
impl Unpin for CudaBackendImpl
impl UnsafeUnpin for CudaBackendImpl
impl UnwindSafe for CudaBackendImpl
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> DistributionExt for T
where
    T: ?Sized,
impl<T> DistributionExt for T
where
    T: ?Sized,
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more

Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts
self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more