use super::common::*;
use std::ptr;
#[cfg(feature = "cuda")]
pub mod cuda_impl {
use super::*;
type CudaResult = i32;
type CudaDevice = i32;
type CudaDeviceProperties = [u8; 352]; type CudaStream = *mut std::ffi::c_void;
type CudaEvent = *mut std::ffi::c_void;
type CublasHandle = *mut std::ffi::c_void;
type CusolverDnHandle = *mut std::ffi::c_void;
const CUDA_SUCCESS: CudaResult = 0;
const CUDA_ERROR_NO_DEVICE: CudaResult = 38;
fn cuda_get_device_count() -> (CudaResult, i32) {
(CUDA_SUCCESS, 0) }
fn cuda_get_device_properties(
_props: &mut CudaDeviceProperties,
_device: CudaDevice,
) -> CudaResult {
CUDA_SUCCESS
}
fn cuda_set_device(_device: CudaDevice) -> CudaResult {
CUDA_SUCCESS
}
fn cuda_device_synchronize() -> CudaResult {
CUDA_SUCCESS
}
fn cuda_malloc(_ptr: *mut *mut std::ffi::c_void, _size: usize) -> CudaResult {
CUDA_SUCCESS
}
fn cuda_free(_ptr: *mut std::ffi::c_void) -> CudaResult {
CUDA_SUCCESS
}
fn cuda_memcpy(
_dst: *mut std::ffi::c_void,
_src: *const std::ffi::c_void,
_count: usize,
_kind: i32,
) -> CudaResult {
CUDA_SUCCESS
}
fn cuda_mem_get_info() -> (CudaResult, usize, usize) {
#[cfg(target_pointer_width = "32")]
let (free, total) = (512 * 1024 * 1024, 1024 * 1024 * 1024); #[cfg(target_pointer_width = "64")]
let (free, total) = (8usize * 1024 * 1024 * 1024, 16usize * 1024 * 1024 * 1024);
(CUDA_SUCCESS, free, total)
}
pub struct CudaBackend {
initialized: bool,
devices: Vec<CudaDeviceInfo>,
driver_version: u32,
runtime_version: u32,
}
#[derive(Debug, Clone)]
struct CudaDeviceInfo {
device_id: i32,
properties: CudaDeviceProperties,
compute_capability: (i32, i32),
max_threads_per_block: i32,
max_block_dim: [i32; 3],
max_grid_dim: [i32; 3],
shared_memory_per_block: usize,
total_constant_memory: usize,
warpsize: i32,
max_pitch: usize,
max_registers_per_block: i32,
clock_rate: i32,
texture_alignment: usize,
concurrent_kernels: bool,
integrated: bool,
can_map_host_memory: bool,
compute_mode: i32,
maxtexture_1d: i32,
maxtexture_2d: [i32; 2],
maxtexture_3d: [i32; 3],
pci_bus_id: String,
pci_device_id: String,
unified_addressing: bool,
memory_clock_rate: i32,
memory_bus_width: i32,
l2_cachesize: usize,
max_threads_per_multiprocessor: i32,
stream_priorities_supported: bool,
global_l1_cache_supported: bool,
local_l1_cache_supported: bool,
managed_memory: bool,
multi_gpu_board: bool,
multi_gpu_board_group_id: i32,
}
impl CudaBackend {
pub fn new() -> LinalgResult<Self> {
let (result, device_count) = cuda_get_device_count();
if result != CUDA_SUCCESS {
if result == CUDA_ERROR_NO_DEVICE {
return Err(LinalgError::ComputationError(
"No CUDA-capable devices found".to_string(),
));
}
return Err(LinalgError::ComputationError(format!(
"Failed to initialize CUDA runtime: error code {}",
result
)));
}
let mut devices = Vec::with_capacity(device_count as usize);
for device_id in 0..device_count {
let mut properties = [0u8; 352];
let result = cuda_get_device_properties(&mut properties, device_id);
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"Failed to get device properties for device {}: error code {}",
device_id, result
)));
}
let device_info = CudaDeviceInfo {
device_id,
properties,
compute_capability: (7, 5), max_threads_per_block: 1024,
max_block_dim: [1024, 1024, 64],
max_grid_dim: [2147483647, 65535, 65535],
shared_memory_per_block: 49152,
total_constant_memory: 65536,
warpsize: 32,
max_pitch: 2147483647,
max_registers_per_block: 65536,
clock_rate: 1590000, texture_alignment: 512,
concurrent_kernels: true,
integrated: false,
can_map_host_memory: true,
compute_mode: 0, maxtexture_1d: 131072,
maxtexture_2d: [131072, 65536],
maxtexture_3d: [16384, 16384, 16384],
pci_bus_id: format!("0000:{:02x}:00.0", device_id),
pci_device_id: "10de:1b80".to_string(), unified_addressing: true,
memory_clock_rate: 7000000, memory_bus_width: 352,
l2_cachesize: 5767168, max_threads_per_multiprocessor: 1024,
stream_priorities_supported: true,
global_l1_cache_supported: true,
local_l1_cache_supported: true,
managed_memory: true,
multi_gpu_board: false,
multi_gpu_board_group_id: 0,
};
devices.push(device_info);
}
Ok(Self {
initialized: true,
devices,
driver_version: 47_057, runtime_version: 114, })
}
pub fn driver_version(&self) -> u32 {
self.driver_version
}
pub fn runtime_version(&self) -> u32 {
self.runtime_version
}
pub fn supports_unified_memory(&self) -> bool {
self.devices.iter().all(|d| d.managed_memory)
}
pub fn max_compute_capability(&self) -> (i32, i32) {
self.devices
.iter()
.map(|d| d.compute_capability)
.max_by_key(|&(major, minor)| major * 10 + minor)
.unwrap_or((0, 0))
}
}
impl GpuBackend for CudaBackend {
fn name(&self) -> &str {
"CUDA"
}
fn is_available(&self) -> bool {
self.initialized && !self.devices.is_empty()
}
fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
if !self.initialized {
return Err(LinalgError::ComputationError(
"CUDA backend not initialized".to_string(),
));
}
let devices = self
.devices
.iter()
.map(|cuda_device| {
let memory_bandwidth = (cuda_device.memory_clock_rate as f64
* 2.0
* cuda_device.memory_bus_width as f64)
/ 8.0
/ 1_000_000.0;
GpuDeviceInfo {
device_type: GpuDeviceType::Cuda,
name: format!(
"CUDA Device {} (Compute {}.{})",
cuda_device.device_id,
cuda_device.compute_capability.0,
cuda_device.compute_capability.1
),
total_memory: {
#[cfg(target_pointer_width = "32")]
{
512 * 1024 * 1024
} #[cfg(target_pointer_width = "64")]
{
11usize * 1024 * 1024 * 1024
} },
compute_units: 68, clock_frequency: (cuda_device.clock_rate / 1000) as u32, supports_fp64: cuda_device.compute_capability.0 >= 2, supports_fp16: cuda_device.compute_capability.0 >= 5
|| (cuda_device.compute_capability.0 == 5
&& cuda_device.compute_capability.1 >= 3), max_work_groupsize: cuda_device.max_threads_per_block as usize,
memory_bandwidth,
l2_cachesize: cuda_device.l2_cachesize,
shared_memory_per_block: cuda_device.shared_memory_per_block,
registers_per_block: cuda_device.max_registers_per_block as u32,
warpsize: cuda_device.warpsize as u32,
max_threads_per_mp: cuda_device.max_threads_per_multiprocessor as u32,
multiprocessor_count: 68, supports_tensor_cores: cuda_device.compute_capability.0 >= 7, supports_mixed_precision: cuda_device.compute_capability.0 >= 5, vendor: "NVIDIA".to_string(),
}
})
.collect();
Ok(devices)
}
fn create_context(&self, device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
if !self.initialized {
return Err(LinalgError::ComputationError(
"CUDA backend not initialized".to_string(),
));
}
if device_id >= self.devices.len() {
return Err(LinalgError::ComputationError(format!(
"Invalid device ID: {} (available devices: {})",
device_id,
self.devices.len()
)));
}
let cuda_device = &self.devices[device_id];
let result = cuda_set_device(cuda_device.device_id);
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"Failed to set CUDA device {}: error code {}",
device_id, result
)));
}
let context = CudaContext::new(cuda_device.clone())?;
Ok(Box::new(context))
}
}
#[derive(Debug)]
pub struct CudaContext {
device_info: CudaDeviceInfo,
device_id: i32,
cublas_handle: Option<CublasHandle>,
cusolver_handle: Option<CusolverDnHandle>,
streams: Vec<CudaStream>,
events: Vec<CudaEvent>,
memory_pool: CudaMemoryPool,
performance_stats: CudaPerformanceStats,
}
unsafe impl Send for CudaContext {}
unsafe impl Sync for CudaContext {}
impl CudaContext {
fn new(device_info: CudaDeviceInfo) -> LinalgResult<Self> {
let device_id = device_info.device_id;
let cublas_handle = None;
let cusolver_handle = None;
let streams = Vec::new();
let events = Vec::new();
let memory_pool = CudaMemoryPool::new(device_id)?;
let performance_stats = CudaPerformanceStats::new();
Ok(Self {
device_info,
device_id,
cublas_handle,
cusolver_handle,
streams,
events,
memory_pool,
performance_stats,
})
}
pub fn cublas_handle(&self) -> Option<CublasHandle> {
self.cublas_handle
}
pub fn cusolver_handle(&self) -> Option<CusolverDnHandle> {
self.cusolver_handle
}
pub fn create_stream(&mut self) -> LinalgResult<CudaStream> {
let stream = ptr::null_mut();
self.streams.push(stream);
Ok(stream)
}
pub fn performance_stats(&self) -> &CudaPerformanceStats {
&self.performance_stats
}
}
impl GpuContext for CudaContext {
#[allow(static_mut_refs)]
fn device_info(&self) -> &GpuDeviceInfo {
static mut CACHED_INFO: Option<GpuDeviceInfo> = None;
unsafe {
if CACHED_INFO.is_none() {
let memory_bandwidth = (self.device_info.memory_clock_rate as f64
* 2.0
* self.device_info.memory_bus_width as f64)
/ 8.0
/ 1_000_000.0;
CACHED_INFO = Some(GpuDeviceInfo {
device_type: GpuDeviceType::Cuda,
name: format!(
"CUDA Device {} (Compute {}.{})",
self.device_info.device_id,
self.device_info.compute_capability.0,
self.device_info.compute_capability.1
),
total_memory: {
#[cfg(target_pointer_width = "32")]
{
512 * 1024 * 1024
} #[cfg(target_pointer_width = "64")]
{
11usize * 1024 * 1024 * 1024
} },
compute_units: 68,
clock_frequency: (self.device_info.clock_rate / 1000) as u32,
supports_fp64: self.device_info.compute_capability.0 >= 2,
supports_fp16: self.device_info.compute_capability.0 >= 5
|| (self.device_info.compute_capability.0 == 5
&& self.device_info.compute_capability.1 >= 3),
max_work_groupsize: self.device_info.max_threads_per_block as usize,
memory_bandwidth,
l2_cachesize: self.device_info.l2_cachesize,
shared_memory_per_block: self.device_info.shared_memory_per_block,
registers_per_block: self.device_info.max_registers_per_block as u32,
warpsize: self.device_info.warpsize as u32,
max_threads_per_mp: self.device_info.max_threads_per_multiprocessor as u32,
multiprocessor_count: 68,
supports_tensor_cores: self.device_info.compute_capability.0 >= 7,
supports_mixed_precision: self.device_info.compute_capability.0 >= 5,
vendor: "NVIDIA".to_string(),
});
}
CACHED_INFO.as_ref().expect("Operation failed")
}
}
fn synchronize(&self) -> LinalgResult<()> {
let result = cuda_device_synchronize();
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"CUDA synchronization failed: error code {}",
result
)));
}
Ok(())
}
fn available_memory(&self) -> LinalgResult<usize> {
let (result, free_mem, _total_mem) = cuda_mem_get_info();
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"Failed to get memory info: error code {}",
result
)));
}
Ok(free_mem)
}
}
impl GpuContextAlloc for CudaContext {
fn allocate_buffer<T: Clone + Send + Sync + Copy + 'static + std::fmt::Debug>(
&self,
size: usize,
) -> LinalgResult<Box<dyn GpuBuffer<T>>> {
let buffer = CudaBuffer::new(size, self.device_id)?;
Ok(Box::new(buffer))
}
}
#[derive(Debug)]
struct CudaMemoryPool {
device_id: i32,
total_allocated: usize,
peak_usage: usize,
allocation_count: usize,
free_blocks: HashMap<usize, Vec<*mut std::ffi::c_void>>,
}
impl CudaMemoryPool {
fn new(device_id: i32) -> LinalgResult<Self> {
Ok(Self {
device_id,
total_allocated: 0,
peak_usage: 0,
allocation_count: 0,
free_blocks: HashMap::new(),
})
}
#[allow(dead_code)]
fn allocate(&mut self, size: usize) -> LinalgResult<*mut std::ffi::c_void> {
if let Some(blocks) = self.free_blocks.get_mut(&size) {
if let Some(ptr) = blocks.pop() {
return Ok(ptr);
}
}
let mut ptr = ptr::null_mut();
let result = cuda_malloc(&mut ptr, size);
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"CUDA memory allocation failed: error code {}",
result
)));
}
self.total_allocated += size;
self.peak_usage = self.peak_usage.max(self.total_allocated);
self.allocation_count += 1;
Ok(ptr)
}
#[allow(dead_code)]
fn deallocate(&mut self, ptr: *mut std::ffi::c_void, size: usize) {
self.free_blocks.entry(size).or_default().push(ptr);
self.total_allocated = self.total_allocated.saturating_sub(size);
}
}
#[derive(Debug)]
struct CudaBuffer<T> {
device_ptr: *mut std::ffi::c_void,
size: usize,
device_id: i32,
is_pinned: bool,
_phantom: std::marker::PhantomData<T>,
}
unsafe impl<T> Send for CudaBuffer<T> {}
unsafe impl<T> Sync for CudaBuffer<T> {}
impl<T: Clone + Send + Sync + Copy> CudaBuffer<T> {
fn new(size: usize, device_id: i32) -> LinalgResult<Self> {
let bytesize = size * std::mem::size_of::<T>();
let mut device_ptr = ptr::null_mut();
let result = cuda_malloc(&mut device_ptr, bytesize);
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"Failed to allocate CUDA buffer: error code {}",
result
)));
}
Ok(Self {
device_ptr,
size,
device_id,
is_pinned: false,
_phantom: std::marker::PhantomData,
})
}
#[allow(dead_code)]
pub fn enable_pinned_memory(&mut self) -> LinalgResult<()> {
self.is_pinned = true;
Ok(())
}
pub fn device_ptr_typed(&self) -> *mut T {
self.device_ptr as *mut T
}
}
impl<T: Clone + Send + Sync + Copy + std::fmt::Debug> GpuBuffer<T> for CudaBuffer<T> {
fn len(&self) -> usize {
self.size
}
fn copy_from_host(&mut self, data: &[T]) -> LinalgResult<()> {
if data.len() != self.size {
return Err(LinalgError::ShapeError(format!(
"Buffer size mismatch: expected {}, got {}",
self.size,
data.len()
)));
}
let bytesize = data.len() * std::mem::size_of::<T>();
let result = cuda_memcpy(
self.device_ptr,
data.as_ptr() as *const std::ffi::c_void,
bytesize,
1, );
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"CUDA host-to-device copy failed: error code {}",
result
)));
}
Ok(())
}
fn copy_to_host(&self, data: &mut [T]) -> LinalgResult<()> {
if data.len() != self.size {
return Err(LinalgError::ShapeError(format!(
"Buffer size mismatch: expected {}, got {}",
self.size,
data.len()
)));
}
let bytesize = data.len() * std::mem::size_of::<T>();
let result = cuda_memcpy(
data.as_mut_ptr() as *mut std::ffi::c_void,
self.device_ptr,
bytesize,
2, );
if result != CUDA_SUCCESS {
return Err(LinalgError::ComputationError(format!(
"CUDA device-to-host copy failed: error code {}",
result
)));
}
Ok(())
}
fn device_ptr(&self) -> *mut std::ffi::c_void {
self.device_ptr
}
}
impl<T> Drop for CudaBuffer<T> {
fn drop(&mut self) {
if !self.device_ptr.is_null() {
let _ = cuda_free(self.device_ptr);
}
}
}
#[derive(Debug, Clone)]
pub struct CudaPerformanceStats {
pub kernel_launches: usize,
pub memory_transfers: usize,
pub total_compute_time_ms: f64,
pub total_transfer_time_ms: f64,
pub peak_memory_usage: usize,
pub average_occupancy: f64,
}
impl CudaPerformanceStats {
fn new() -> Self {
Self {
kernel_launches: 0,
memory_transfers: 0,
total_compute_time_ms: 0.0,
total_transfer_time_ms: 0.0,
peak_memory_usage: 0,
average_occupancy: 0.0,
}
}
pub fn compute_efficiency(&self) -> f64 {
if self.total_compute_time_ms + self.total_transfer_time_ms == 0.0 {
return 0.0;
}
self.total_compute_time_ms / (self.total_compute_time_ms + self.total_transfer_time_ms)
}
}
}
#[cfg(feature = "cuda")]
pub use cuda_impl::*;
#[cfg(not(feature = "cuda"))]
pub struct CudaBackend;
#[cfg(not(feature = "cuda"))]
impl CudaBackend {
pub fn new() -> LinalgResult<Self> {
Err(LinalgError::ComputationError(
"CUDA support not compiled in".to_string(),
))
}
}
#[cfg(not(feature = "cuda"))]
impl GpuBackend for CudaBackend {
fn name(&self) -> &str {
"CUDA (not available)"
}
fn is_available(&self) -> bool {
false
}
fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>> {
Ok(vec![])
}
fn create_context(&self, _device_id: usize) -> LinalgResult<Box<dyn GpuContext>> {
Err(LinalgError::ComputationError(
"CUDA support not compiled in".to_string(),
))
}
}