1#[cfg(all(feature = "cuda", cuda_runtime_available))]
4use anyhow::anyhow;
5use anyhow::Result;
6
7#[derive(Debug, Clone)]
9pub struct GpuDevice {
10 pub device_id: i32,
11 pub name: String,
12 pub compute_capability: (i32, i32),
13 pub total_memory: usize,
14 pub free_memory: usize,
15 pub max_threads_per_block: i32,
16 pub max_blocks_per_grid: i32,
17 pub warp_size: i32,
18 pub memory_bandwidth: f32,
19 pub peak_flops: f64,
20}
21
22impl GpuDevice {
23 pub fn get_device_info(device_id: i32) -> Result<Self> {
25 #[cfg(all(feature = "cuda", cuda_runtime_available))]
26 {
27 use cuda_runtime_sys::*;
28 unsafe {
29 let result = cudaSetDevice(device_id);
30 if result != cudaError_t::cudaSuccess {
31 return Err(anyhow!("Failed to set CUDA device {}", device_id));
32 }
33
34 let mut props: cudaDeviceProp = std::mem::zeroed();
35 let result = cudaGetDeviceProperties(&mut props, device_id);
36 if result != cudaError_t::cudaSuccess {
37 return Err(anyhow!("Failed to get device properties"));
38 }
39
40 let mut free_mem: usize = 0;
41 let mut total_mem: usize = 0;
42 let result = cudaMemGetInfo(&mut free_mem, &mut total_mem);
43 if result != cudaError_t::cudaSuccess {
44 return Err(anyhow!("Failed to get memory info"));
45 }
46
47 Ok(Self {
48 device_id,
49 name: std::ffi::CStr::from_ptr(props.name.as_ptr())
50 .to_string_lossy()
51 .to_string(),
52 compute_capability: (props.major, props.minor),
53 total_memory: total_mem,
54 free_memory: free_mem,
55 max_threads_per_block: props.maxThreadsPerBlock,
56 max_blocks_per_grid: props.maxGridSize[0],
57 warp_size: props.warpSize,
58 memory_bandwidth: props.memoryBusWidth as f32
59 * props.memoryClockRate as f32
60 * 2.0
61 / 8.0
62 / 1e6,
63 peak_flops: props.clockRate as f64
64 * props.multiProcessorCount as f64
65 * props.maxThreadsPerMultiProcessor as f64
66 / 1e6,
67 })
68 }
69 }
70
71 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
72 {
73 tracing::warn!("CUDA not available - using simulated GPU device");
75 Ok(Self {
76 device_id,
77 name: format!("Simulated GPU {device_id}"),
78 compute_capability: (7, 5), total_memory: 8 * 1024 * 1024 * 1024, free_memory: 6 * 1024 * 1024 * 1024, max_threads_per_block: 1024,
82 max_blocks_per_grid: 65535,
83 warp_size: 32,
84 memory_bandwidth: 900.0, peak_flops: 14000.0, })
87 }
88 }
89
90 pub fn get_all_devices() -> Result<Vec<Self>> {
92 #[cfg(all(feature = "cuda", cuda_runtime_available))]
93 {
94 use cuda_runtime_sys::*;
95 unsafe {
96 let mut device_count: i32 = 0;
97 let result = cudaGetDeviceCount(&mut device_count);
98 if result != cudaError_t::cudaSuccess {
99 return Err(anyhow!("Failed to get device count"));
100 }
101
102 let mut devices = Vec::new();
103 for i in 0..device_count {
104 if let Ok(device) = Self::get_device_info(i) {
105 devices.push(device);
106 }
107 }
108 Ok(devices)
109 }
110 }
111
112 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
113 {
114 tracing::warn!("CUDA not available - using simulated GPU devices");
116 Ok(vec![Self::get_device_info(0)?, Self::get_device_info(1)?])
117 }
118 }
119
120 pub fn supports_compute_capability(&self, major: i32, minor: i32) -> bool {
122 self.compute_capability.0 > major
123 || (self.compute_capability.0 == major && self.compute_capability.1 >= minor)
124 }
125
126 pub fn peak_memory_bandwidth(&self) -> f32 {
128 self.memory_bandwidth
129 }
130
131 pub fn peak_compute_performance(&self) -> f64 {
133 self.peak_flops
134 }
135
136 pub fn calculate_optimal_block_config(&self, problem_size: usize) -> (i32, i32) {
138 let optimal_threads = (self.max_threads_per_block as f32 * 0.75) as i32; let blocks_needed = ((problem_size as f32) / (optimal_threads as f32)).ceil() as i32;
140 let blocks = blocks_needed.min(self.max_blocks_per_grid);
141 (blocks, optimal_threads)
142 }
143}