Skip to main content

ferrotorch_gpu/
lib.rs

1//! CUDA GPU backend for ferrotorch.
2//!
3//! This crate provides device management, memory allocation, and host/device
4//! data transfers built on [`cudarc`]. It is the bridge between ferrotorch's
5//! CPU tensor world and NVIDIA GPUs.
6//!
7//! # Feature flags
8//!
9//! | Feature | Default | Description |
10//! |---------|---------|-------------|
11//! | `cuda`  | **yes** | Links against the CUDA driver API via cudarc. Disable to compile on machines without a GPU. |
12//!
13//! # Quick start
14//!
15//! ```rust,no_run
16//! use ferrotorch_gpu::{GpuDevice, cpu_to_gpu, gpu_to_cpu};
17//!
18//! let device = GpuDevice::new(0).unwrap();
19//! let host_data = vec![1.0f32, 2.0, 3.0];
20//! let gpu_buf = cpu_to_gpu(&host_data, &device).unwrap();
21//! let back = gpu_to_cpu(&gpu_buf, &device).unwrap();
22//! assert_eq!(back, host_data);
23//! ```
24
25pub mod allocator;
26pub mod backend_impl;
27pub mod blas;
28pub mod buffer;
29pub mod conv;
30pub mod cusolver;
31pub mod device;
32pub mod error;
33pub mod flash_attention;
34pub mod graph;
35pub mod kernels;
36pub mod memory_guard;
37pub mod module_cache;
38pub mod pool;
39pub mod rng;
40pub mod stream;
41pub mod tensor_bridge;
42pub mod transfer;
43
44// Re-exports for ergonomic use.
45pub use allocator::CudaAllocator;
46pub use backend_impl::{CudaBackendImpl, get_cuda_device, init_cuda_backend};
47pub use blas::gpu_bmm_f32;
48pub use blas::{gpu_bmm_f32_into, gpu_matmul_f32_into};
49pub use blas::{gpu_matmul_f32, gpu_matmul_f64};
50pub use buffer::CudaBuffer;
51pub use conv::gpu_conv2d_f32;
52pub use device::GpuDevice;
53pub use error::{GpuError, GpuResult};
54pub use graph::{
55    CaptureMode, CapturePool, CaptureStatus, CapturedGraph, GraphPoolHandle,
56    begin_capture, capture_pool_for_handle, end_capture, end_capture_with_pool, graph_pool_handle,
57    make_graphed_callable, release_graph_pool_handle,
58};
// These `graph` items are re-exported only when the `cuda` feature is enabled;
// the other `graph` re-exports in this file are unconditional.
59#[cfg(feature = "cuda")]
60pub use graph::{
61    GraphCaptureGuard, begin_capture_with_mode, begin_capture_with_pool, capture_status,
62    is_stream_capturing,
63};
64pub use flash_attention::gpu_flash_attention_f32;
65pub use kernels::{gpu_add, gpu_mul, gpu_neg, gpu_relu, gpu_sub};
66pub use kernels::{
67    gpu_add_into, gpu_embed_lookup_into, gpu_gelu_into, gpu_layernorm_into, gpu_mul_into,
68    gpu_permute_0213_into, gpu_scale_into, gpu_slice_read_into, gpu_small_matmul_into,
69    gpu_softmax_into, gpu_transpose_2d_into,
70};
71pub use kernels::{gpu_broadcast_add, gpu_broadcast_mul, gpu_broadcast_sub};
72pub use kernels::{gpu_causal_mask_indirect, gpu_slice_write_indirect};
73pub use kernels::{
74    gpu_dropout, gpu_embed_lookup, gpu_gelu, gpu_layernorm, gpu_permute_0213, gpu_slice_read,
75    gpu_slice_write, gpu_small_bmm, gpu_small_matmul, gpu_softmax, gpu_transpose_2d,
76};
77pub use memory_guard::{
78    MemoryGuard, MemoryGuardBuilder, MemoryGuardedDevice, MemoryHook, MemoryPressureListener,
79    MemoryReservation, MemoryStats, MemoryWatchdog, OomPolicy, PressureLevel,
80};
81pub use pool::{cached_bytes, empty_cache, empty_cache_all, round_len};
82pub use rng::{CudaRngManager, PhiloxGenerator, PhiloxState, cuda_rng_manager, fork_rng, join_rng};
83pub use tensor_bridge::{GpuFloat, GpuTensor, cuda, cuda_default, tensor_to_cpu, tensor_to_gpu};
84pub use transfer::{alloc_zeros, alloc_zeros_f32, alloc_zeros_f64, cpu_to_gpu, gpu_to_cpu};