use anyhow::{anyhow, Result};
/// Static properties plus a point-in-time memory snapshot for one GPU.
///
/// Populated from the CUDA runtime when available (see
/// `get_cuda_device_info`); all fields are plain data, so the type is
/// cheap to `Clone` and easy to `Debug`-print.
#[derive(Debug, Clone)]
pub struct GpuDevice {
/// CUDA device ordinal (0-based), as reported by the runtime.
pub device_id: i32,
/// Human-readable device name from the driver.
pub name: String,
/// Compute capability as (major, minor).
pub compute_capability: (i32, i32),
/// Total device memory in bytes.
pub total_memory: usize,
/// Free device memory in bytes at the moment of the query.
pub free_memory: usize,
/// Maximum threads per block supported by the device.
pub max_threads_per_block: i32,
/// Maximum grid size along the first dimension (maxGridSize[0]).
pub max_blocks_per_grid: i32,
/// Warp size in threads (32 on all current NVIDIA hardware).
pub warp_size: i32,
/// Theoretical memory bandwidth — derived in `get_cuda_device_info` from
/// bus width and memory clock; presumably GB/s — TODO confirm units.
pub memory_bandwidth: f32,
/// Estimated peak throughput in FLOP/s (cores × clock × 2), an
/// approximation — see `estimate_cuda_cores`.
pub peak_flops: f64,
}
impl GpuDevice {
    /// Heuristic ranking score for device selection: a weighted sum of
    /// free memory (GiB, weight 0.3), peak compute (TFLOP/s, weight 0.5),
    /// and memory bandwidth (scaled by 1/1000, weight 0.2). Higher is
    /// better; only relative ordering between devices is meaningful.
    pub fn performance_score(&self) -> f64 {
        const BYTES_PER_GIB: f64 = 1_073_741_824.0; // 1024^3

        let memory_gib = self.free_memory as f64 / BYTES_PER_GIB;
        let compute_tflops = self.peak_flops / 1e12;
        let bandwidth_scaled = f64::from(self.memory_bandwidth) / 1000.0;

        0.3 * memory_gib + 0.5 * compute_tflops + 0.2 * bandwidth_scaled
    }

    /// Returns `true` when this device's compute capability is at least
    /// `required`, comparing (major, minor) lexicographically — i.e. a
    /// strictly newer major always qualifies, and an equal major needs a
    /// minor at least as large.
    pub fn supports_compute_capability(&self, required: (i32, i32)) -> bool {
        // Tuple `PartialOrd` is exactly the lexicographic comparison the
        // original spelled out by hand.
        self.compute_capability >= required
    }
}
/// Enumerate all visible GPU devices.
///
/// Returns an empty `Vec` (not an error) when the binary was built
/// without the `cuda` feature or without the `cuda_runtime_available`
/// cfg (presumably emitted by a build script — TODO confirm).
///
/// Exactly one of the two cfg-gated blocks below survives compilation,
/// and whichever survives is the function's tail expression — do not
/// reorder or add statements after them.
pub fn query_gpu_devices() -> Result<Vec<GpuDevice>> {
#[cfg(all(feature = "cuda", cuda_runtime_available))]
{
get_cuda_devices()
}
#[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
{
Ok(Vec::new())
}
}
/// Select the available GPU with the highest [`GpuDevice::performance_score`].
///
/// # Errors
/// Returns an error if device enumeration fails or no devices are present.
pub fn get_best_gpu_device() -> Result<GpuDevice> {
    query_gpu_devices()?
        .into_iter()
        // `f64::total_cmp` is a total order, so the comparator stays
        // consistent even if a score is NaN. The previous
        // `partial_cmp(..).unwrap_or(Ordering::Equal)` treated NaN as
        // equal to everything, letting `max_by` return an arbitrary
        // (possibly NaN-scored) device.
        .max_by(|a, b| a.performance_score().total_cmp(&b.performance_score()))
        // `max_by` yields `None` exactly when the list is empty, which
        // subsumes the old explicit `is_empty()` check + `expect`.
        .ok_or_else(|| anyhow!("No GPU devices available"))
}
/// Look up a GPU by its CUDA device ordinal.
///
/// # Errors
/// Returns an error if enumeration fails or no device has `device_id`.
pub fn get_gpu_device(device_id: i32) -> Result<GpuDevice> {
    for device in query_gpu_devices()? {
        if device.device_id == device_id {
            return Ok(device);
        }
    }
    Err(anyhow!("GPU device {} not found", device_id))
}
/// Enumerate CUDA devices through the CUDA runtime API.
///
/// Devices whose individual property query fails are skipped rather
/// than failing the whole enumeration.
#[cfg(all(feature = "cuda", cuda_runtime_available))]
fn get_cuda_devices() -> Result<Vec<GpuDevice>> {
    use cuda_runtime_sys::*;

    let mut count: i32 = 0;
    // SAFETY: cudaGetDeviceCount only writes through the pointer we pass,
    // which points at a live local for the duration of the call.
    let status = unsafe { cudaGetDeviceCount(&mut count) };
    if status != cudaError_t::cudaSuccess {
        return Err(anyhow!("Failed to get CUDA device count"));
    }

    // Best-effort per-device query: drop any device we cannot describe.
    Ok((0..count)
        .filter_map(|id| get_cuda_device_info(id).ok())
        .collect())
}
/// Query properties and current memory usage for CUDA device `device_id`.
///
/// NOTE(review): this calls `cudaSetDevice` to read per-device memory
/// info and never restores the previously current device — callers that
/// depend on the active device should save/restore it themselves.
/// Confirm this side effect is acceptable.
#[cfg(all(feature = "cuda", cuda_runtime_available))]
fn get_cuda_device_info(device_id: i32) -> Result<GpuDevice> {
use cuda_runtime_sys::*;
// SAFETY: cudaDeviceProp is a plain-old-data C struct, so an all-zero
// value is a valid instance for the runtime to fill in.
let mut props: cudaDeviceProp = unsafe { std::mem::zeroed() };
unsafe {
let result = cudaGetDeviceProperties(&mut props, device_id);
if result != cudaError_t::cudaSuccess {
return Err(anyhow!("Failed to get device properties for device {}", device_id));
}
}
let mut free_memory: usize = 0;
let mut total_memory: usize = 0;
unsafe {
// cudaMemGetInfo reports on the *current* device, so we must switch
// to `device_id` first (see the NOTE above about not restoring it).
let result = cudaSetDevice(device_id);
if result != cudaError_t::cudaSuccess {
return Err(anyhow!("Failed to set device {}", device_id));
}
let result = cudaMemGetInfo(&mut free_memory, &mut total_memory);
if result != cudaError_t::cudaSuccess {
return Err(anyhow!("Failed to get memory info for device {}", device_id));
}
}
// SAFETY: props.name is a NUL-terminated C char array filled in by
// cudaGetDeviceProperties above; lossy conversion handles any non-UTF-8.
let name = unsafe {
std::ffi::CStr::from_ptr(props.name.as_ptr())
.to_string_lossy()
.into_owned()
};
let cores = estimate_cuda_cores(&props);
// clockRate appears to be in kHz (hence /1e6 for GHz); the ×2 presumably
// assumes 2 FLOPs/cycle (fused multiply-add) — TODO confirm both.
let clock_rate_ghz = props.clockRate as f64 / 1_000_000.0; let peak_flops = cores as f64 * clock_rate_ghz * 2.0 * 1_000_000_000.0;
Ok(GpuDevice {
device_id,
name,
compute_capability: (props.major, props.minor),
total_memory,
free_memory,
max_threads_per_block: props.maxThreadsPerBlock,
// Only the first grid dimension is captured here.
max_blocks_per_grid: props.maxGridSize[0],
warp_size: props.warpSize,
// bus width (bits) × memory clock (kHz?) × 2 (DDR) / 8 (bits→bytes)
// / 1e6 — presumably yields GB/s; TODO confirm units.
memory_bandwidth: props.memoryBusWidth as f32 * props.memoryClockRate as f32 * 2.0 / 8.0 / 1_000_000.0, peak_flops,
})
}
/// Approximate the CUDA core count as streaming-multiprocessor count ×
/// cores-per-SM for the device's compute capability. Unknown
/// architectures fall back to 64 cores per SM.
#[cfg(all(feature = "cuda", cuda_runtime_available))]
fn estimate_cuda_cores(props: &cuda_runtime_sys::cudaDeviceProp) -> i32 {
    let sm_count = props.multiProcessorCount;
    let cores_per_sm = match (props.major, props.minor) {
        (3, 5) | (3, 7) => 192,
        (5, 0) | (5, 2) => 128,
        (6, 0) => 64,
        (6, 1) => 128,
        (7, 0) | (7, 5) => 64,
        _ => 64, // conservative default for unlisted capabilities
    };
    sm_count * cores_per_sm
}