1#![allow(dead_code)]
/// Per-SM resource limits of a GPU, as consumed by the occupancy calculation.
#[derive(Clone, Debug, PartialEq)]
pub struct GpuSpec {
    /// Maximum number of threads that may be resident on one SM.
    pub max_threads_per_sm: u32,
    /// Maximum number of blocks that may be resident on one SM.
    pub max_blocks_per_sm: u32,
    /// Shared memory available on one SM (unit presumably bytes — confirm).
    pub max_shared_memory_per_sm: u32,
    /// Registers available on one SM.
    pub max_registers_per_sm: u32,
    /// Number of threads in a warp.
    pub warp_size: u32,
    /// Number of SMs on the device.
    pub sm_count: u32,
}

impl GpuSpec {
    /// Warp width shared by all built-in presets.
    const PRESET_WARP_SIZE: u32 = 32;

    /// Preset with 1536 threads, 16 blocks, 49152 shared memory and
    /// 65536 registers per SM, across 30 SMs.
    #[must_use]
    pub fn mid_range() -> Self {
        Self::preset(1536, 16, 49_152, 65_536, 30)
    }

    /// Preset with 2048 threads, 32 blocks, 102400 shared memory and
    /// 65536 registers per SM, across 80 SMs.
    #[must_use]
    pub fn high_end() -> Self {
        Self::preset(2048, 32, 102_400, 65_536, 80)
    }

    /// Preset with 512 threads, 8 blocks, 32768 shared memory and
    /// 32768 registers per SM, across 8 SMs.
    #[must_use]
    pub fn integrated() -> Self {
        Self::preset(512, 8, 32_768, 32_768, 8)
    }

    /// Assembles a preset from its per-SM limits; the warp size is always
    /// [`Self::PRESET_WARP_SIZE`].
    fn preset(threads: u32, blocks: u32, shared_memory: u32, registers: u32, sms: u32) -> Self {
        GpuSpec {
            max_threads_per_sm: threads,
            max_blocks_per_sm: blocks,
            max_shared_memory_per_sm: shared_memory,
            max_registers_per_sm: registers,
            warp_size: Self::PRESET_WARP_SIZE,
            sm_count: sms,
        }
    }
}
65
/// Resources a kernel launch requests for each block.
#[derive(Clone, Debug, PartialEq)]
pub struct KernelResources {
    /// Threads launched in every block.
    pub threads_per_block: u32,
    /// Registers used by each thread.
    pub registers_per_thread: u32,
    /// Shared memory requested by every block (same unit as
    /// `GpuSpec::max_shared_memory_per_sm`).
    pub shared_memory_per_block: u32,
}

impl KernelResources {
    /// Builds a resource description from explicit thread, register and
    /// shared-memory figures.
    #[must_use]
    pub fn new(threads: u32, registers: u32, shared_mem: u32) -> Self {
        KernelResources {
            threads_per_block: threads,
            registers_per_thread: registers,
            shared_memory_per_block: shared_mem,
        }
    }

    /// Shorthand for a kernel with `threads` threads per block,
    /// 32 registers per thread and no shared memory.
    #[must_use]
    pub fn simple(threads: u32) -> Self {
        Self::new(threads, 32, 0)
    }
}
98
99#[derive(Debug, Clone, PartialEq)]
101pub struct OccupancyResult {
102 pub occupancy: f64,
104 pub active_warps_per_sm: u32,
106 pub max_warps_per_sm: u32,
108 pub active_blocks_per_sm: u32,
110 pub limiting_factor: OccupancyLimit,
112}
113
/// The resource that bounds how many blocks can be resident on an SM.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OccupancyLimit {
    /// The per-SM block cap is the binding constraint.
    BlockCount,
    /// The per-SM thread (warp) capacity is the binding constraint.
    ThreadCount,
    /// Register usage is the binding constraint.
    Registers,
    /// Shared memory usage is the binding constraint.
    SharedMemory,
    /// No specific constraint was identified.
    None,
}

impl std::fmt::Display for OccupancyLimit {
    /// Writes a short lowercase, human-readable label for the limit.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::BlockCount => "block count",
            Self::ThreadCount => "thread count",
            Self::Registers => "register usage",
            Self::SharedMemory => "shared memory",
            Self::None => "none",
        };
        f.write_str(label)
    }
}
140
141pub struct OccupancyCalculator;
143
144impl OccupancyCalculator {
145 #[allow(clippy::cast_precision_loss)]
147 #[allow(clippy::manual_checked_ops)]
148 #[must_use]
149 pub fn calculate(spec: &GpuSpec, kernel: &KernelResources) -> OccupancyResult {
150 if kernel.threads_per_block == 0 || spec.warp_size == 0 {
151 return OccupancyResult {
152 occupancy: 0.0,
153 active_warps_per_sm: 0,
154 max_warps_per_sm: 0,
155 active_blocks_per_sm: 0,
156 limiting_factor: OccupancyLimit::ThreadCount,
157 };
158 }
159
160 let max_warps = spec.max_threads_per_sm / spec.warp_size;
161 let warps_per_block = kernel.threads_per_block.div_ceil(spec.warp_size);
162
163 let blocks_by_count = spec.max_blocks_per_sm;
165
166 let blocks_by_threads = max_warps.checked_div(warps_per_block).unwrap_or(0);
168
169 let blocks_by_registers = if kernel.registers_per_thread > 0 {
171 let regs_per_block = kernel.registers_per_thread * kernel.threads_per_block;
172 spec.max_registers_per_sm
173 .checked_div(regs_per_block)
174 .unwrap_or(blocks_by_count)
175 } else {
176 blocks_by_count
177 };
178
179 let blocks_by_shared = spec
181 .max_shared_memory_per_sm
182 .checked_div(kernel.shared_memory_per_block)
183 .unwrap_or(blocks_by_count);
184
185 let active_blocks = blocks_by_count
187 .min(blocks_by_threads)
188 .min(blocks_by_registers)
189 .min(blocks_by_shared);
190
191 let limiting_factor = if active_blocks == 0 {
192 OccupancyLimit::ThreadCount
193 } else if active_blocks == blocks_by_shared && blocks_by_shared < blocks_by_count {
194 OccupancyLimit::SharedMemory
195 } else if active_blocks == blocks_by_registers && blocks_by_registers < blocks_by_count {
196 OccupancyLimit::Registers
197 } else if active_blocks == blocks_by_threads && blocks_by_threads < blocks_by_count {
198 OccupancyLimit::ThreadCount
199 } else if active_blocks == blocks_by_count {
200 OccupancyLimit::BlockCount
201 } else {
202 OccupancyLimit::None
203 };
204
205 let active_warps = active_blocks * warps_per_block;
206 let occupancy = if max_warps > 0 {
207 f64::from(active_warps) / f64::from(max_warps)
208 } else {
209 0.0
210 };
211 let occupancy = occupancy.min(1.0);
212
213 OccupancyResult {
214 occupancy,
215 active_warps_per_sm: active_warps.min(max_warps),
216 max_warps_per_sm: max_warps,
217 active_blocks_per_sm: active_blocks,
218 limiting_factor,
219 }
220 }
221
222 #[allow(clippy::cast_precision_loss)]
226 #[must_use]
227 pub fn find_optimal_block_size(
228 spec: &GpuSpec,
229 registers_per_thread: u32,
230 shared_memory_per_block: u32,
231 ) -> u32 {
232 let mut best_occupancy = 0.0_f64;
233 let mut best_size = spec.warp_size;
234
235 let max_threads = spec.max_threads_per_sm.min(1024);
236 let mut threads = spec.warp_size;
237
238 while threads <= max_threads {
239 let kernel =
240 KernelResources::new(threads, registers_per_thread, shared_memory_per_block);
241 let result = Self::calculate(spec, &kernel);
242 if result.occupancy > best_occupancy {
243 best_occupancy = result.occupancy;
244 best_size = threads;
245 }
246 threads += spec.warp_size;
247 }
248
249 best_size
250 }
251
252 #[allow(clippy::cast_precision_loss)]
254 #[must_use]
255 pub fn estimate_bandwidth(
256 occupancy: f64,
257 peak_bandwidth_gbps: f64,
258 memory_intensity: f64,
259 ) -> f64 {
260 let eff = occupancy.clamp(0.0, 1.0);
261 let intensity = memory_intensity.clamp(0.0, 1.0);
262 peak_bandwidth_gbps * eff * intensity
263 }
264}
265
/// A single piece of tuning advice together with its priority.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PerformanceTip {
    /// Human-readable advice text.
    pub message: String,
    /// Priority of the tip; `analyze_performance` assigns 1 to its most
    /// severe message and 3 to its informational one.
    pub priority: u32,
}

impl PerformanceTip {
    /// Creates a tip, copying `message` into an owned `String`.
    #[must_use]
    pub fn new(message: &str, priority: u32) -> Self {
        let message = String::from(message);
        PerformanceTip { message, priority }
    }
}
285
286#[must_use]
288pub fn analyze_performance(result: &OccupancyResult) -> Vec<PerformanceTip> {
289 let mut tips = Vec::new();
290
291 if result.occupancy < 0.25 {
292 tips.push(PerformanceTip::new(
293 "Very low occupancy. Consider reducing resource usage per thread.",
294 1,
295 ));
296 } else if result.occupancy < 0.5 {
297 tips.push(PerformanceTip::new(
298 "Low occupancy. Adjusting block size or register usage may help.",
299 2,
300 ));
301 }
302
303 match result.limiting_factor {
304 OccupancyLimit::Registers => {
305 tips.push(PerformanceTip::new(
306 "Register usage is the bottleneck. Consider reducing local variables.",
307 2,
308 ));
309 }
310 OccupancyLimit::SharedMemory => {
311 tips.push(PerformanceTip::new(
312 "Shared memory is the bottleneck. Consider reducing shared memory usage.",
313 2,
314 ));
315 }
316 _ => {}
317 }
318
319 if result.occupancy >= 0.75 {
320 tips.push(PerformanceTip::new(
321 "Good occupancy. Focus on memory access patterns and instruction throughput.",
322 3,
323 ));
324 }
325
326 tips
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// The mid-range preset exposes its documented thread and warp limits.
    #[test]
    fn test_gpu_spec_mid_range() {
        let gpu = GpuSpec::mid_range();
        assert_eq!(gpu.max_threads_per_sm, 1536);
        assert_eq!(gpu.warp_size, 32);
    }

    /// The high-end preset exposes its thread limit and SM count.
    #[test]
    fn test_gpu_spec_high_end() {
        let gpu = GpuSpec::high_end();
        assert_eq!(gpu.max_threads_per_sm, 2048);
        assert_eq!(gpu.sm_count, 80);
    }

    /// The integrated preset exposes its thread limit and SM count.
    #[test]
    fn test_gpu_spec_integrated() {
        let gpu = GpuSpec::integrated();
        assert_eq!(gpu.max_threads_per_sm, 512);
        assert_eq!(gpu.sm_count, 8);
    }

    /// `simple` fills in 32 registers per thread and no shared memory.
    #[test]
    fn test_kernel_resources_simple() {
        let resources = KernelResources::simple(256);
        assert_eq!(resources.threads_per_block, 256);
        assert_eq!(resources.registers_per_thread, 32);
        assert_eq!(resources.shared_memory_per_block, 0);
    }

    /// A light kernel achieves a non-zero occupancy no greater than 1.0.
    #[test]
    fn test_occupancy_simple_kernel() {
        let outcome =
            OccupancyCalculator::calculate(&GpuSpec::mid_range(), &KernelResources::simple(256));
        assert!(outcome.occupancy > 0.0);
        assert!(outcome.occupancy <= 1.0);
        assert!(outcome.active_warps_per_sm > 0);
    }

    /// Heavy register usage never pushes occupancy above 1.0.
    #[test]
    fn test_occupancy_heavy_registers() {
        let outcome = OccupancyCalculator::calculate(
            &GpuSpec::mid_range(),
            &KernelResources::new(256, 128, 0),
        );
        assert!(outcome.occupancy <= 1.0);
    }

    /// A large shared-memory request is reported as the limiting factor.
    #[test]
    fn test_occupancy_heavy_shared_memory() {
        let outcome = OccupancyCalculator::calculate(
            &GpuSpec::mid_range(),
            &KernelResources::new(256, 32, 32768),
        );
        assert!(outcome.occupancy > 0.0);
        assert_eq!(outcome.limiting_factor, OccupancyLimit::SharedMemory);
    }

    /// A kernel with zero threads per block has zero occupancy.
    #[test]
    fn test_occupancy_zero_threads() {
        let outcome = OccupancyCalculator::calculate(
            &GpuSpec::mid_range(),
            &KernelResources::new(0, 32, 0),
        );
        assert!(outcome.occupancy.abs() < f64::EPSILON);
    }

    /// The suggested block size is a warp multiple within the SM's limits.
    #[test]
    fn test_find_optimal_block_size() {
        let gpu = GpuSpec::mid_range();
        let block_size = OccupancyCalculator::find_optimal_block_size(&gpu, 32, 0);
        assert!(block_size >= gpu.warp_size);
        assert!(block_size <= gpu.max_threads_per_sm);
        assert_eq!(block_size % gpu.warp_size, 0);
    }

    /// Bandwidth scales linearly with occupancy and memory intensity.
    #[test]
    fn test_estimate_bandwidth() {
        let full = OccupancyCalculator::estimate_bandwidth(1.0, 500.0, 1.0);
        assert!((full - 500.0).abs() < f64::EPSILON);

        let partial = OccupancyCalculator::estimate_bandwidth(0.5, 500.0, 0.8);
        assert!((partial - 200.0).abs() < f64::EPSILON);
    }

    /// Out-of-range occupancy and intensity are clamped to 1.0.
    #[test]
    fn test_estimate_bandwidth_clamping() {
        let clamped = OccupancyCalculator::estimate_bandwidth(2.0, 500.0, 1.5);
        assert!((clamped - 500.0).abs() < f64::EPSILON);
    }

    /// Very low occupancy produces the "Very low" warning.
    #[test]
    fn test_performance_tips_low_occupancy() {
        let outcome = OccupancyResult {
            occupancy: 0.1,
            active_warps_per_sm: 4,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 1,
            limiting_factor: OccupancyLimit::Registers,
        };
        let advice = analyze_performance(&outcome);
        assert!(!advice.is_empty());
        assert!(advice.iter().any(|tip| tip.message.contains("Very low")));
    }

    /// High occupancy produces the "Good occupancy" note.
    #[test]
    fn test_performance_tips_good_occupancy() {
        let outcome = OccupancyResult {
            occupancy: 0.8,
            active_warps_per_sm: 38,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 6,
            limiting_factor: OccupancyLimit::None,
        };
        let advice = analyze_performance(&outcome);
        assert!(advice
            .iter()
            .any(|tip| tip.message.contains("Good occupancy")));
    }

    /// `Display` renders the expected lowercase labels.
    #[test]
    fn test_occupancy_limit_display() {
        assert_eq!(OccupancyLimit::BlockCount.to_string(), "block count");
        assert_eq!(OccupancyLimit::Registers.to_string(), "register usage");
        assert_eq!(OccupancyLimit::SharedMemory.to_string(), "shared memory");
    }

    /// `PerformanceTip::new` stores the message and priority it is given.
    #[test]
    fn test_performance_tip_creation() {
        let PerformanceTip { message, priority } = PerformanceTip::new("test tip", 5);
        assert_eq!(message, "test tip");
        assert_eq!(priority, 5);
    }
}