Expand description
CUDA GPU backend for ferrotorch.
This crate provides device management, memory allocation, and host/device
data transfers built on cudarc. It is the bridge between ferrotorch’s
CPU tensor world and NVIDIA GPUs.
## Feature flags
| Feature | Default | Description |
|---|---|---|
| cuda | yes | Links against the CUDA driver API via cudarc. Disable to compile on machines without a GPU. |
## Quick start

```rust
use ferrotorch_gpu::{GpuDevice, cpu_to_gpu, gpu_to_cpu};
let device = GpuDevice::new(0).unwrap();
let host_data = vec![1.0f32, 2.0, 3.0];
let gpu_buf = cpu_to_gpu(&host_data, &device).unwrap();
let back = gpu_to_cpu(&gpu_buf, &device).unwrap();
assert_eq!(back, host_data);
```

## Re-exports
```rust
pub use allocator::CudaAllocator;
pub use backend_impl::CudaBackendImpl;
pub use backend_impl::get_cuda_device;
pub use backend_impl::init_cuda_backend;
pub use blas::gpu_bmm_f32;
pub use blas::gpu_bmm_f32_into;
pub use blas::gpu_matmul_f32_into;
pub use blas::gpu_matmul_f32;
pub use blas::gpu_matmul_f64;
pub use buffer::CudaBuffer;
pub use conv::gpu_conv2d_f32;
pub use device::GpuDevice;
pub use error::GpuError;
pub use error::GpuResult;
pub use graph::CaptureMode;
pub use graph::CapturePool;
pub use graph::CaptureStatus;
pub use graph::CapturedGraph;
pub use graph::GraphPoolHandle;
pub use graph::begin_capture;
pub use graph::capture_pool_for_handle;
pub use graph::end_capture;
pub use graph::end_capture_with_pool;
pub use graph::graph_pool_handle;
pub use graph::make_graphed_callable;
pub use graph::release_graph_pool_handle;
pub use graph::GraphCaptureGuard;
pub use graph::begin_capture_with_mode;
pub use graph::begin_capture_with_pool;
pub use graph::capture_status;
pub use graph::is_stream_capturing;
pub use flash_attention::gpu_flash_attention_f32;
pub use kernels::gpu_add;
pub use kernels::gpu_mul;
pub use kernels::gpu_neg;
pub use kernels::gpu_relu;
pub use kernels::gpu_sub;
pub use kernels::gpu_add_into;
pub use kernels::gpu_embed_lookup_into;
pub use kernels::gpu_gelu_into;
pub use kernels::gpu_layernorm_into;
pub use kernels::gpu_mul_into;
pub use kernels::gpu_permute_0213_into;
pub use kernels::gpu_scale_into;
pub use kernels::gpu_slice_read_into;
pub use kernels::gpu_small_matmul_into;
pub use kernels::gpu_softmax_into;
pub use kernels::gpu_transpose_2d_into;
pub use kernels::gpu_broadcast_add;
pub use kernels::gpu_broadcast_mul;
pub use kernels::gpu_broadcast_sub;
pub use kernels::gpu_causal_mask_indirect;
pub use kernels::gpu_slice_write_indirect;
pub use kernels::gpu_dropout;
pub use kernels::gpu_embed_lookup;
pub use kernels::gpu_gelu;
pub use kernels::gpu_layernorm;
pub use kernels::gpu_permute_0213;
pub use kernels::gpu_slice_read;
pub use kernels::gpu_slice_write;
pub use kernels::gpu_small_bmm;
pub use kernels::gpu_small_matmul;
pub use kernels::gpu_softmax;
pub use kernels::gpu_transpose_2d;
pub use memory_guard::MemoryGuard;
pub use memory_guard::MemoryGuardBuilder;
pub use memory_guard::MemoryGuardedDevice;
pub use memory_guard::MemoryHook;
pub use memory_guard::MemoryPressureListener;
pub use memory_guard::MemoryReservation;
pub use memory_guard::MemoryStats;
pub use memory_guard::MemoryWatchdog;
pub use memory_guard::OomPolicy;
pub use memory_guard::PressureLevel;
pub use pool::cached_bytes;
pub use pool::empty_cache;
pub use pool::empty_cache_all;
pub use pool::round_len;
pub use rng::CudaRngManager;
pub use rng::PhiloxGenerator;
pub use rng::PhiloxState;
pub use rng::cuda_rng_manager;
pub use rng::fork_rng;
pub use rng::join_rng;
pub use tensor_bridge::GpuFloat;
pub use tensor_bridge::GpuTensor;
pub use tensor_bridge::cuda;
pub use tensor_bridge::cuda_default;
pub use tensor_bridge::tensor_to_cpu;
pub use tensor_bridge::tensor_to_gpu;
pub use transfer::alloc_zeros;
pub use transfer::alloc_zeros_f32;
pub use transfer::alloc_zeros_f64;
pub use transfer::cpu_to_gpu;
pub use transfer::gpu_to_cpu;
```
## Modules
- allocator
- Caching CUDA memory allocator.
- backend_impl
- CUDA implementation of the `GpuBackend` trait from ferrotorch-core.
- blas
- cuBLAS-backed GPU matrix multiplication.
- buffer
- GPU memory buffer with pool-aware Drop.
- conv
- GPU-accelerated 2-D convolution via im2col + cuBLAS GEMM.
- cusolver
- cuSOLVER-backed GPU linear algebra: SVD, Cholesky, QR, Solve.
- device
- CUDA device management.
- error
- flash_attention
- GPU-accelerated FlashAttention via a custom PTX kernel with shared memory.
- graph
- CUDA graph capture and replay infrastructure.
- kernels
- Custom PTX CUDA kernels for elementwise GPU operations.
- memory_guard
- GPU memory safety system — reservation, OOM recovery, pressure monitoring, and pre-OOM hooks.
- module_cache
- Global cache for compiled CUDA modules and kernel functions.
- pool
- GPU buffer pool — caching allocator for CUDA memory.
- rng
- CUDA RNG state management with Philox 4x32-10 counter-based generator.
- stream
- CUDA stream pool with thread-local current stream and event wrappers.
- tensor_bridge
- Bridge between ferrotorch-core `Tensor<T>` and GPU operations.
- transfer
- Host-to-device and device-to-host memory transfers.