entrenar/autograd/cuda_backward/mod.rs
1//! CUDA-accelerated backward kernels for autograd
2//!
3//! This module wraps trueno-gpu backward kernels for GPU-accelerated gradient computation.
4//! Provides 10-100x speedup over CPU ndarray implementations.
5//!
6//! # Safety
7//!
8//! This module uses `unsafe` code to launch CUDA kernels.
9//! The `unsafe` blocks are required for the FFI calls to the CUDA driver API.
10//!
11//! # Architecture (SPEC-FT-001 v3.0.0)
12//!
13//! ```text
14//! entrenar autograd
15//!   └── cuda_backward (this module)
16//!       └── trueno-gpu/kernels/backward
17//!           └── PTX generation + CUDA driver
18//! ```
19//!
20//! # Available Kernels
21//!
22//! - `relu_backward` - ReLU gradient: dL/dx = dL/dy * (x > 0)
23//! - `gelu_backward` - GELU gradient with tanh approximation
24//! - `silu_backward` - SiLU/Swish gradient
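25//! - `softmax_backward` - Softmax Jacobian-vector product (see also `batched_softmax_backward`)
26//! - `rms_norm_backward` - RMSNorm gradients for input and gamma (`rms_norm_forward` is re-exported alongside it)
27//! - `layer_norm_backward` - LayerNorm gradients for input, gamma, beta
28//! - `gemm_backward_a` - Matrix multiply gradient w.r.t. A (see also the `_accumulate`, `_fp16_dispatch`, and `_fp16_dispatch_accumulate` variants)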
29//! - `gemm_backward_b` - Matrix multiply gradient w.r.t. B
30
31mod cache;
32mod elementwise;
33mod gemm;
34mod structured;
35
36#[cfg(test)]
37mod tests;
38
39#[cfg(feature = "cuda")]
40pub(crate) use cache::set_backward_cublas_stream;
41pub use cache::{init_kernel_cache, pre_warm_lora_backward_kernels};
42pub use elementwise::{gelu_backward, relu_backward, silu_backward};
43pub use gemm::{
44 gemm_backward_a, gemm_backward_a_accumulate, gemm_backward_a_fp16_dispatch,
45 gemm_backward_a_fp16_dispatch_accumulate, gemm_backward_b,
46};
47pub use structured::{
48 batched_softmax_backward, layer_norm_backward, rms_norm_backward, rms_norm_forward,
49 softmax_backward,
50};