use crate::error::{RusTorchError, RusTorchResult};
#[cfg(feature = "cuda")]
use bytemuck::{Pod, Zeroable};
#[cfg(feature = "cuda")]
use cudarc::{
cublas::{CudaBlas, Gemm},
driver::{
CudaDevice, CudaFunction, CudaSlice, DeviceRepr, LaunchAsync, LaunchConfig, ValidAsZeroBits,
},
nvrtc::compile_ptx,
};
use std::sync::Arc;
pub enum CudaKernelType {
ElementWise,
MatMul,
Reduction,
Convolution,
BatchNorm,
}
pub struct CudaKernelParams {
pub grid_dim: (u32, u32, u32),
pub block_dim: (u32, u32, u32),
pub shared_mem_size: usize,
pub stream_id: usize,
}
impl Default for CudaKernelParams {
fn default() -> Self {
Self {
grid_dim: (1, 1, 1),
block_dim: (256, 1, 1),
shared_mem_size: 0,
stream_id: 0,
}
}
}
#[derive(Debug)]
pub struct CudaBuffer<T> {
#[cfg(feature = "cuda")]
pub slice: CudaSlice<T>,
#[cfg(not(feature = "cuda"))]
_phantom: std::marker::PhantomData<T>,
pub size: usize,
pub device_id: usize,
}
#[cfg(feature = "cuda")]
impl<T: Clone + ValidAsZeroBits + DeviceRepr + Unpin> CudaBuffer<T> {
pub fn new(_size: usize, _device_id: usize) -> RusTorchResult<Self> {
#[cfg(feature = "cuda")]
{
use cudarc::driver::CudaDevice;
let device = CudaDevice::new(_device_id).map_err(|e| {
RusTorchError::tensor_op(format!(
"Failed to initialize CUDA device {}: {}",
_device_id, e
))
})?;
let slice = device.alloc_zeros::<T>(_size).map_err(|e| {
RusTorchError::tensor_op(format!("Failed to allocate CUDA memory: {}", e))
})?;
Ok(Self {
slice,
size: _size,
device_id: _device_id,
})
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::BackendUnavailable {
backend: "CUDA".to_string(),
})
}
}
pub fn copy_from_host(&mut self, host_data: &[T]) -> RusTorchResult<()> {
if host_data.len() != self.size {
return Err(RusTorchError::InvalidParameters {
operation: "copy_from_host".to_string(),
message: "Size mismatch in host-to-device copy".to_string(),
});
}
#[cfg(feature = "cuda")]
{
use cudarc::driver::CudaDevice;
let device = CudaDevice::new(self.device_id)
.map_err(|e| RusTorchError::gpu(format!("Failed to get CUDA device: {}", e)))?;
self.slice = device.htod_copy(host_data.to_vec()).map_err(|e| {
RusTorchError::tensor_op(format!("Host-to-device copy failed: {}", e))
})?;
Ok(())
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::BackendUnavailable {
backend: "CUDA".to_string(),
})
}
}
pub fn copy_to_host(&self, host_data: &mut [T]) -> RusTorchResult<()> {
if host_data.len() != self.size {
return Err(RusTorchError::InvalidParameters {
operation: "copy_to_host".to_string(),
message: "Size mismatch in device-to-host copy".to_string(),
});
}
#[cfg(feature = "cuda")]
{
use cudarc::driver::CudaDevice;
let device = CudaDevice::new(self.device_id)
.map_err(|e| RusTorchError::gpu(format!("Failed to get CUDA device: {}", e)))?;
device
.dtoh_sync_copy_into(&self.slice, host_data)
.map_err(|e| {
RusTorchError::tensor_op(format!("Device-to-host copy failed: {}", e))
})?;
Ok(())
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::BackendUnavailable {
backend: "CUDA".to_string(),
})
}
}
}
impl<T> Drop for CudaBuffer<T> {
fn drop(&mut self) {
#[cfg(feature = "cuda")]
{
if let Ok(device) = CudaDevice::new(self.device_id) {
let _ = device.synchronize();
}
}
}
}
#[cfg(feature = "cuda")]
pub struct CudaKernelExecutor {
device: Arc<CudaDevice>,
cublas: CudaBlas,
device_id: usize,
}
#[cfg(feature = "cuda")]
impl CudaKernelExecutor {
pub fn new(device_id: usize) -> RusTorchResult<Self> {
let device = CudaDevice::new(device_id).map_err(|e| {
RusTorchError::gpu(format!(
"Failed to initialize CUDA device {}: {}",
device_id, e
))
})?;
let cublas = CudaBlas::new(device.clone())
.map_err(|e| RusTorchError::gpu(format!("Failed to initialize cuBLAS: {}", e)))?;
Ok(Self {
device,
cublas,
device_id,
})
}
pub fn matmul_f32(
&self,
a: &[f32],
b: &[f32],
c: &mut [f32],
m: usize,
n: usize,
k: usize,
) -> RusTorchResult<()> {
let a_gpu = self
.device
.htod_copy(a.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy matrix A to device: {}", e)))?;
let b_gpu = self
.device
.htod_copy(b.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy matrix B to device: {}", e)))?;
let mut c_gpu = self
.device
.alloc_zeros::<f32>(m * n)
.map_err(|e| RusTorchError::gpu(format!("Failed to allocate result matrix: {}", e)))?;
use cudarc::cublas::GemmConfig;
let cfg = GemmConfig {
transa: cudarc::cublas::sys::cublasOperation_t::CUBLAS_OP_N,
transb: cudarc::cublas::sys::cublasOperation_t::CUBLAS_OP_N,
m: m as i32,
n: n as i32,
k: k as i32,
lda: k as i32,
ldb: n as i32,
ldc: n as i32,
alpha: 1.0f32,
beta: 0.0f32,
};
unsafe {
self.cublas
.gemm(cfg, &a_gpu, &b_gpu, &mut c_gpu)
.map_err(|e| RusTorchError::tensor_op(format!("cuBLAS GEMM failed: {}", e)))?;
}
self.device.dtoh_sync_copy_into(&c_gpu, c).map_err(|e| {
RusTorchError::tensor_op(format!("Failed to copy result to host: {}", e))
})?;
Ok(())
}
pub fn elementwise_add_f32(&self, a: &[f32], b: &[f32], c: &mut [f32]) -> RusTorchResult<()> {
let size = a.len();
if b.len() != size || c.len() != size {
return Err(RusTorchError::InvalidParameters {
operation: "elementwise_add_f32".to_string(),
message: "Array size mismatch in element-wise addition".to_string(),
});
}
let kernel_src = r#"
extern "C" __global__ void elementwise_add_f32(
const float* a,
const float* b,
float* c,
int n
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
"#;
let ptx = compile_ptx(kernel_src).map_err(|e| {
RusTorchError::kernel_compilation(format!("Failed to compile CUDA kernel: {}", e))
})?;
self.device
.load_ptx(ptx, "elementwise_add", &["elementwise_add_f32"])
.map_err(|e| RusTorchError::kernel_compilation(format!("Failed to load PTX: {}", e)))?;
let a_gpu = self
.device
.htod_copy(a.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy array A to device: {}", e)))?;
let b_gpu = self
.device
.htod_copy(b.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy array B to device: {}", e)))?;
let mut c_gpu = self
.device
.alloc_zeros::<f32>(size)
.map_err(|e| RusTorchError::gpu(format!("Failed to allocate result array: {}", e)))?;
let func = self
.device
.get_func("elementwise_add", "elementwise_add_f32")
.ok_or_else(|| {
RusTorchError::kernel_compilation(
"Failed to get kernel function: function not found".to_string(),
)
})?;
let block_size = 256;
let grid_size = size.div_ceil(block_size);
let config = LaunchConfig {
grid_dim: (grid_size as u32, 1, 1),
block_dim: (block_size as u32, 1, 1),
shared_mem_bytes: 0,
};
unsafe {
func.launch(config, (&a_gpu, &b_gpu, &mut c_gpu, size as i32))
.map_err(|e| RusTorchError::gpu(format!("Kernel launch failed: {}", e)))?;
}
self.device.dtoh_sync_copy_into(&c_gpu, c).map_err(|e| {
RusTorchError::tensor_op(format!("Failed to copy result to host: {}", e))
})?;
Ok(())
}
pub fn reduce_sum_f32(&self, input: &[f32]) -> RusTorchResult<f32> {
let size = input.len();
let kernel_src = r#"
extern "C" __global__ void reduce_sum_f32(
const float* input,
float* output,
int n
) {
extern __shared__ float sdata[];
int tid = threadIdx.x;
int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : 0.0f;
__syncthreads();
// Reduction in shared memory
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// Write result for this block to global memory
if (tid == 0) output[blockIdx.x] = sdata[0];
}
"#;
let ptx = compile_ptx(kernel_src).map_err(|e| {
RusTorchError::kernel_compilation(format!("Failed to compile reduction kernel: {}", e))
})?;
self.device
.load_ptx(ptx, "reduce_sum", &["reduce_sum_f32"])
.map_err(|e| {
RusTorchError::kernel_compilation(format!("Failed to load reduction PTX: {}", e))
})?;
let input_gpu = self
.device
.htod_copy(input.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy input to device: {}", e)))?;
let block_size = 256;
let grid_size = size.div_ceil(block_size);
let mut output_gpu = self
.device
.alloc_zeros::<f32>(grid_size)
.map_err(|e| RusTorchError::gpu(format!("Failed to allocate output array: {}", e)))?;
let func = self
.device
.get_func("reduce_sum", "reduce_sum_f32")
.ok_or_else(|| {
RusTorchError::kernel_compilation(
"Failed to get reduction function: function not found".to_string(),
)
})?;
let config = LaunchConfig {
grid_dim: (grid_size as u32, 1, 1),
block_dim: (block_size as u32, 1, 1),
shared_mem_bytes: (block_size * std::mem::size_of::<f32>()) as u32,
};
unsafe {
func.launch(config, (&input_gpu, &mut output_gpu, size as i32))
.map_err(|e| {
RusTorchError::gpu(format!("Reduction kernel launch failed: {}", e))
})?;
}
let mut partial_results = vec![0.0f32; grid_size];
self.device
.dtoh_sync_copy_into(&output_gpu, &mut partial_results)
.map_err(|e| {
RusTorchError::tensor_op(format!("Failed to copy partial results: {}", e))
})?;
Ok(partial_results.iter().sum())
}
pub fn conv2d_f32(
&self,
input: &[f32],
kernel: &[f32],
output: &mut [f32],
input_height: usize,
input_width: usize,
input_channels: usize,
output_channels: usize,
kernel_height: usize,
kernel_width: usize,
stride_h: usize,
stride_w: usize,
pad_h: usize,
pad_w: usize,
) -> RusTorchResult<()> {
let output_height = (input_height + 2 * pad_h - kernel_height) / stride_h + 1;
let output_width = (input_width + 2 * pad_w - kernel_width) / stride_w + 1;
let expected_output_size = output_channels * output_height * output_width;
if output.len() != expected_output_size {
return Err(RusTorchError::InvalidParameters {
operation: "conv2d_f32".to_string(),
message: format!(
"Output size mismatch: expected {}, got {}",
expected_output_size,
output.len()
),
});
}
let kernel_src = r#"
extern "C" __global__ void conv2d_f32(
const float* input,
const float* kernel,
float* output,
int input_height,
int input_width,
int input_channels,
int output_channels,
int kernel_height,
int kernel_width,
int stride_h,
int stride_w,
int pad_h,
int pad_w,
int output_height,
int output_width
) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int output_size = output_channels * output_height * output_width;
if (tid >= output_size) return;
int oc = tid / (output_height * output_width);
int oh = (tid % (output_height * output_width)) / output_width;
int ow = (tid % (output_height * output_width)) % output_width;
float sum = 0.0f;
for (int ic = 0; ic < input_channels; ic++) {
for (int kh = 0; kh < kernel_height; kh++) {
for (int kw = 0; kw < kernel_width; kw++) {
int ih = oh * stride_h - pad_h + kh;
int iw = ow * stride_w - pad_w + kw;
if (ih >= 0 && ih < input_height && iw >= 0 && iw < input_width) {
int input_idx = ic * input_height * input_width + ih * input_width + iw;
int kernel_idx = oc * input_channels * kernel_height * kernel_width +
ic * kernel_height * kernel_width + kh * kernel_width + kw;
sum += input[input_idx] * kernel[kernel_idx];
}
}
}
}
output[tid] = sum;
}
"#;
let ptx = compile_ptx(kernel_src).map_err(|e| {
RusTorchError::kernel_compilation(format!(
"Failed to compile CUDA conv2d kernel: {}",
e
))
})?;
self.device
.load_ptx(ptx, "conv2d", &["conv2d_f32"])
.map_err(|e| {
RusTorchError::kernel_compilation(format!("Failed to load conv2d PTX: {}", e))
})?;
let input_gpu = self
.device
.htod_copy(input.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy input to device: {}", e)))?;
let kernel_gpu = self
.device
.htod_copy(kernel.to_vec())
.map_err(|e| RusTorchError::gpu(format!("Failed to copy kernel to device: {}", e)))?;
let mut output_gpu = self
.device
.alloc_zeros::<f32>(expected_output_size)
.map_err(|e| RusTorchError::gpu(format!("Failed to allocate output: {}", e)))?;
let func = self
.device
.get_func("conv2d", "conv2d_f32")
.ok_or_else(|| {
RusTorchError::kernel_compilation(
"Failed to get conv2d function: function not found".to_string(),
)
})?;
let block_size = 256;
let grid_size = expected_output_size.div_ceil(block_size);
let config = LaunchConfig {
grid_dim: (grid_size as u32, 1, 1),
block_dim: (block_size as u32, 1, 1),
shared_mem_bytes: 0,
};
unsafe {
let params = (
&input_gpu,
&kernel_gpu,
&mut output_gpu,
input_height as i32,
input_width as i32,
input_channels as i32,
output_channels as i32,
kernel_height as i32,
kernel_width as i32,
stride_h as i32,
stride_w as i32,
pad_h as i32,
pad_w as i32,
output_height as i32,
output_width as i32,
);
func.launch(config, params)
.map_err(|e| RusTorchError::gpu(format!("Conv2d kernel launch failed: {}", e)))?;
}
self.device
.dtoh_sync_copy_into(&output_gpu, output)
.map_err(|e| {
RusTorchError::tensor_op(format!("Failed to copy conv2d result to host: {}", e))
})?;
Ok(())
}
}
#[cfg(not(feature = "cuda"))]
pub struct CudaKernelExecutor {
_device_id: usize,
}
#[cfg(not(feature = "cuda"))]
impl CudaKernelExecutor {
pub fn new(_device_id: usize) -> RusTorchResult<Self> {
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
pub fn matmul_f32(
&self,
_a: &[f32],
_b: &[f32],
_c: &mut [f32],
_m: usize,
_n: usize,
_k: usize,
) -> RusTorchResult<()> {
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
pub fn elementwise_add_f32(
&self,
_a: &[f32],
_b: &[f32],
_c: &mut [f32],
) -> RusTorchResult<()> {
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
pub fn reduce_sum_f32(&self, _input: &[f32]) -> RusTorchResult<f32> {
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
pub fn conv2d_f32(
&self,
_input: &[f32],
_kernel: &[f32],
_output: &mut [f32],
_input_height: usize,
_input_width: usize,
_input_channels: usize,
_output_channels: usize,
_kernel_height: usize,
_kernel_width: usize,
_stride_h: usize,
_stride_w: usize,
_pad_h: usize,
_pad_w: usize,
) -> RusTorchResult<()> {
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
}
pub fn cuda_matmul_f32(
_a: &[f32],
_b: &[f32],
_c: &mut [f32],
_m: usize,
_n: usize,
_k: usize,
) -> RusTorchResult<()> {
#[cfg(feature = "cuda")]
{
let executor = CudaKernelExecutor::new(0)?;
executor.matmul_f32(_a, _b, _c, _m, _n, _k)
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
}
pub fn cuda_elementwise_add_f32(_a: &[f32], _b: &[f32], _c: &mut [f32]) -> RusTorchResult<()> {
#[cfg(feature = "cuda")]
{
let executor = CudaKernelExecutor::new(0)?;
executor.elementwise_add_f32(_a, _b, _c)
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
}
pub fn cuda_reduce_sum_f32(_input: &[f32]) -> RusTorchResult<f32> {
#[cfg(feature = "cuda")]
{
let executor = CudaKernelExecutor::new(0)?;
executor.reduce_sum_f32(_input)
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
}
pub fn cuda_conv2d_f32(
_input: &[f32],
_kernel: &[f32],
_output: &mut [f32],
_input_height: usize,
_input_width: usize,
_input_channels: usize,
_output_channels: usize,
_kernel_height: usize,
_kernel_width: usize,
_stride_h: usize,
_stride_w: usize,
_pad_h: usize,
_pad_w: usize,
) -> RusTorchResult<()> {
#[cfg(feature = "cuda")]
{
let executor = CudaKernelExecutor::new(0)?;
executor.conv2d_f32(
_input,
_kernel,
_output,
_input_height,
_input_width,
_input_channels,
_output_channels,
_kernel_height,
_kernel_width,
_stride_h,
_stride_w,
_pad_h,
_pad_w,
)
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::UnsupportedDevice(
"CUDA not available".to_string(),
))
}
}
pub mod cuda_utils {
use super::*;
pub fn calculate_launch_config(size: usize, max_threads_per_block: usize) -> (usize, usize) {
let block_size = (max_threads_per_block).min(256);
let grid_size = size.div_ceil(block_size);
(grid_size, block_size)
}
pub fn get_device_properties(_device_id: usize) -> RusTorchResult<DeviceProperties> {
#[cfg(feature = "cuda")]
{
use cudarc::driver::CudaDevice;
let device = CudaDevice::new(0)
.map_err(|e| RusTorchError::gpu(format!("Failed to get device 0: {}", e)))?;
Ok(DeviceProperties {
name: "CUDA Device 0".to_string(),
compute_capability: (7, 5), max_threads_per_block: 1024,
max_shared_memory: 49152,
warp_size: 32,
memory_size: 8 * 1024 * 1024 * 1024usize, })
}
#[cfg(not(feature = "cuda"))]
{
Err(RusTorchError::BackendUnavailable {
backend: "CUDA".to_string(),
})
}
}
}
pub struct DeviceProperties {
pub name: String,
pub compute_capability: (u32, u32),
pub max_threads_per_block: usize,
pub max_shared_memory: usize,
pub warp_size: usize,
pub memory_size: usize,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[cfg(feature = "cuda")]
fn test_cuda_buffer_creation() {
let result = CudaBuffer::<f32>::new(1024, 0);
match result {
Ok(_) => println!("CUDA buffer created successfully"),
Err(_) => println!("CUDA buffer creation failed (expected if no GPU)"),
}
}
#[test]
#[cfg(not(feature = "cuda"))]
fn test_cuda_buffer_creation_no_cuda() {
assert!(true);
}
#[test]
fn test_cuda_kernel_params() {
let params = CudaKernelParams::default();
assert_eq!(params.grid_dim, (1, 1, 1));
assert_eq!(params.block_dim, (256, 1, 1));
assert_eq!(params.shared_mem_size, 0);
assert_eq!(params.stream_id, 0);
}
#[test]
fn test_cuda_utils() {
let (grid_size, block_size) = cuda_utils::calculate_launch_config(1000, 256);
assert_eq!(block_size, 256);
assert_eq!(grid_size, 4); }
}