#[cfg(all(feature = "cuda", cuda_runtime_available))]
use anyhow::anyhow;
use anyhow::Result;
/// Snapshot of one GPU's identity, limits and rough performance figures.
///
/// Values are either read from the CUDA runtime or, when CUDA is unavailable
/// or a query fails, filled with fixed placeholder numbers for a simulated
/// device.
#[derive(Debug, Clone)]
pub struct GpuDevice {
/// Device ordinal, as passed to the CUDA runtime calls.
pub device_id: i32,
/// Human-readable device name (`"Simulated GPU <id>"` for simulated devices).
pub name: String,
/// Compute capability as `(major, minor)`.
pub compute_capability: (i32, i32),
/// Total device memory as reported by `cudaMemGetInfo` (presumably bytes — confirm).
pub total_memory: usize,
/// Free device memory as reported by `cudaMemGetInfo` at query time (presumably bytes — confirm).
pub free_memory: usize,
/// Maximum number of threads in a single block.
pub max_threads_per_block: i32,
/// Maximum number of blocks per grid; taken from the first entry of `maxGridSize`.
pub max_blocks_per_grid: i32,
/// Number of threads in a warp.
pub warp_size: i32,
/// Estimated memory bandwidth, derived from bus width and memory clock
/// (the scaling looks like GB/s — TODO confirm units).
pub memory_bandwidth: f32,
/// Estimated peak compute throughput, derived from core clock, multiprocessor
/// count and threads per multiprocessor (units not established here — TODO confirm).
pub peak_flops: f64,
}
impl GpuDevice {
    /// Builds a placeholder device with fixed, hard-coded properties.
    ///
    /// Used whenever real device information cannot be obtained (CUDA support
    /// not compiled in, or a CUDA runtime query failed).
    fn simulated(device_id: i32) -> Self {
        Self {
            device_id,
            name: format!("Simulated GPU {device_id}"),
            compute_capability: (7, 5),
            total_memory: 8 * 1024 * 1024 * 1024,
            free_memory: 6 * 1024 * 1024 * 1024,
            max_threads_per_block: 1024,
            max_blocks_per_grid: 65535,
            warp_size: 32,
            memory_bandwidth: 900.0,
            peak_flops: 14000.0,
        }
    }

    /// Queries the properties of the device with ordinal `device_id`.
    ///
    /// With CUDA support compiled in, the CUDA runtime is asked for the
    /// device properties and current memory usage. If any of those queries
    /// fails, a warning is logged and a simulated device is returned instead.
    /// Without CUDA support a simulated device is always returned.
    ///
    /// # Errors
    ///
    /// None of the code paths currently produce an `Err`; the `Result`
    /// return type is kept for API stability.
    ///
    /// NOTE(review): the CUDA path calls `cudaSetDevice(device_id)` and does
    /// not restore whichever device was selected before — confirm callers do
    /// not rely on the previous selection.
    pub fn get_device_info(device_id: i32) -> Result<Self> {
        #[cfg(all(feature = "cuda", cuda_runtime_available))]
        {
            use cuda_runtime_sys::*;

            // SAFETY: `cudaSetDevice` takes the ordinal by value and reports
            // an unusable ordinal through its status code.
            let status = unsafe { cudaSetDevice(device_id) };
            if status != cudaError_t::cudaSuccess {
                tracing::warn!(
                    "CUDA device {} not available - using simulated GPU device",
                    device_id
                );
                return Ok(Self::simulated(device_id));
            }

            // SAFETY: `cudaDeviceProp` is assumed to be a plain C struct from
            // the bindings (integers and fixed-size arrays), for which the
            // all-zero bit pattern is a valid value.
            let mut props: cudaDeviceProp = unsafe { std::mem::zeroed() };
            // SAFETY: `props` is a live, exclusively borrowed local that the
            // runtime fills in.
            let status = unsafe { cudaGetDeviceProperties(&mut props, device_id) };
            if status != cudaError_t::cudaSuccess {
                tracing::warn!(
                    "Failed to get properties for CUDA device {} - using simulated GPU device",
                    device_id
                );
                return Ok(Self::simulated(device_id));
            }

            let mut free_mem: usize = 0;
            let mut total_mem: usize = 0;
            // SAFETY: both pointers refer to live, exclusively borrowed locals.
            let status = unsafe { cudaMemGetInfo(&mut free_mem, &mut total_mem) };
            if status != cudaError_t::cudaSuccess {
                tracing::warn!(
                    "Failed to get memory info for CUDA device {} - using simulated GPU device",
                    device_id
                );
                return Ok(Self::simulated(device_id));
            }

            // SAFETY: `props.name` is a buffer inside `props` that was
            // zero-initialised above and is expected to hold a NUL-terminated
            // C string written by the runtime.
            let name = unsafe { std::ffi::CStr::from_ptr(props.name.as_ptr()) }
                .to_string_lossy()
                .into_owned();

            Ok(Self {
                device_id,
                name,
                compute_capability: (props.major, props.minor),
                total_memory: total_mem,
                free_memory: free_mem,
                max_threads_per_block: props.maxThreadsPerBlock,
                // Only the first grid dimension is recorded.
                max_blocks_per_grid: props.maxGridSize[0],
                warp_size: props.warpSize,
                // bus width * memory clock * 2, divided by 8 and scaled by 1e6.
                memory_bandwidth: props.memoryBusWidth as f32
                    * props.memoryClockRate as f32
                    * 2.0
                    / 8.0
                    / 1e6,
                // core clock * multiprocessor count * threads per
                // multiprocessor, scaled by 1e6.
                peak_flops: props.clockRate as f64
                    * props.multiProcessorCount as f64
                    * props.maxThreadsPerMultiProcessor as f64
                    / 1e6,
            })
        }
        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
        {
            tracing::warn!("CUDA not available - using simulated GPU device");
            Ok(Self::simulated(device_id))
        }
    }

    /// Returns information for every device that can be enumerated.
    ///
    /// With CUDA support compiled in, the runtime's device count determines
    /// which ordinals are queried through [`GpuDevice::get_device_info`].
    /// Without CUDA support, two simulated devices (ordinals 0 and 1) are
    /// returned.
    ///
    /// # Errors
    ///
    /// Fails when the CUDA runtime cannot report the device count.
    pub fn get_all_devices() -> Result<Vec<Self>> {
        #[cfg(all(feature = "cuda", cuda_runtime_available))]
        {
            use cuda_runtime_sys::*;

            let mut device_count: i32 = 0;
            // SAFETY: the pointer refers to a live, exclusively borrowed local.
            let status = unsafe { cudaGetDeviceCount(&mut device_count) };
            if status != cudaError_t::cudaSuccess {
                return Err(anyhow!("Failed to get device count"));
            }

            // Devices whose lookup fails are skipped rather than aborting
            // the whole enumeration.
            Ok((0..device_count)
                .filter_map(|ordinal| Self::get_device_info(ordinal).ok())
                .collect())
        }
        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
        {
            tracing::warn!("CUDA not available - using simulated GPU devices");
            Ok(vec![Self::get_device_info(0)?, Self::get_device_info(1)?])
        }
    }

    /// Returns `true` when this device's compute capability is at least
    /// `major.minor`.
    pub fn supports_compute_capability(&self, major: i32, minor: i32) -> bool {
        // Tuples compare lexicographically: major first, then minor.
        self.compute_capability >= (major, minor)
    }

    /// Returns the stored memory-bandwidth estimate.
    pub fn peak_memory_bandwidth(&self) -> f32 {
        self.memory_bandwidth
    }

    /// Returns the stored peak compute-throughput estimate.
    pub fn peak_compute_performance(&self) -> f64 {
        self.peak_flops
    }

    /// Chooses a `(blocks, threads_per_block)` pair for `problem_size` items.
    ///
    /// `threads_per_block` is 75% of `max_threads_per_block` (rounded toward
    /// zero) and never less than 1. `blocks` is the number of such blocks
    /// needed to cover `problem_size`, capped at `max_blocks_per_grid`.
    /// A `problem_size` of 0 yields 0 blocks.
    ///
    /// When the cap applies, `blocks * threads_per_block` is smaller than
    /// `problem_size`, so the caller has to handle the remaining items.
    pub fn calculate_optimal_block_config(&self, problem_size: usize) -> (i32, i32) {
        // Integer math throughout: `f32` represents integers exactly only up
        // to 2^24, so a float ceil-division can come out one block short for
        // large problem sizes.
        let three_quarters = i64::from(self.max_threads_per_block) * 3 / 4;
        // Clamp to 1 so a tiny or invalid thread limit cannot produce a zero
        // (or negative) thread count and a division by zero below.
        let optimal_threads = i32::try_from(three_quarters)
            .unwrap_or(i32::MAX)
            .max(1);

        let threads = u64::from(optimal_threads.unsigned_abs());
        let size = u64::try_from(problem_size).unwrap_or(u64::MAX);
        // Ceil-division written so it cannot overflow.
        let blocks_needed = size / threads + u64::from(size % threads != 0);

        let blocks = i32::try_from(blocks_needed)
            .unwrap_or(i32::MAX)
            .min(self.max_blocks_per_grid);
        (blocks, optimal_threads)
    }
}