1#![allow(dead_code)]
/// Per-SM resource limits of a GPU: the hardware side of an occupancy
/// calculation.
///
/// NOTE(review): "SM" presumably abbreviates "streaming multiprocessor" — confirm.
#[derive(Debug, Clone, PartialEq)]
pub struct GpuSpec {
    /// Maximum number of threads that can be resident on one SM.
    pub max_threads_per_sm: u32,
    /// Maximum number of blocks that can be resident on one SM.
    pub max_blocks_per_sm: u32,
    /// Shared memory available on one SM, in the same unit as a kernel's
    /// per-block shared memory request (NOTE(review): presumably bytes — confirm).
    pub max_shared_memory_per_sm: u32,
    /// Number of registers available on one SM.
    pub max_registers_per_sm: u32,
    /// Number of threads in one warp.
    pub warp_size: u32,
    /// Number of SMs on the device.
    pub sm_count: u32,
}

impl GpuSpec {
    /// Preset describing a mid-range device: 1536 threads, 16 blocks,
    /// 49152 units of shared memory and 65536 registers per SM; 30 SMs.
    #[must_use]
    pub const fn mid_range() -> Self {
        Self {
            max_threads_per_sm: 1536,
            max_blocks_per_sm: 16,
            max_shared_memory_per_sm: 49_152,
            max_registers_per_sm: 65_536,
            warp_size: 32,
            sm_count: 30,
        }
    }

    /// Preset describing a high-end device: 2048 threads, 32 blocks,
    /// 102400 units of shared memory and 65536 registers per SM; 80 SMs.
    #[must_use]
    pub const fn high_end() -> Self {
        Self {
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_shared_memory_per_sm: 102_400,
            max_registers_per_sm: 65_536,
            warp_size: 32,
            sm_count: 80,
        }
    }

    /// Preset describing an integrated device: 512 threads, 8 blocks,
    /// 32768 units of shared memory and 32768 registers per SM; 8 SMs.
    #[must_use]
    pub const fn integrated() -> Self {
        Self {
            max_threads_per_sm: 512,
            max_blocks_per_sm: 8,
            max_shared_memory_per_sm: 32_768,
            max_registers_per_sm: 32_768,
            warp_size: 32,
            sm_count: 8,
        }
    }
}
65
/// Per-block resource demands of a kernel: the software side of an
/// occupancy calculation.
#[derive(Debug, Clone, PartialEq)]
pub struct KernelResources {
    /// Number of threads in each block.
    pub threads_per_block: u32,
    /// Number of registers used by each thread.
    pub registers_per_thread: u32,
    /// Shared memory requested by each block
    /// (NOTE(review): presumably bytes — confirm).
    pub shared_memory_per_block: u32,
}

impl KernelResources {
    /// Builds a description from explicit values.
    ///
    /// * `threads` - threads per block.
    /// * `registers` - registers per thread.
    /// * `shared_mem` - shared memory per block.
    #[must_use]
    pub const fn new(threads: u32, registers: u32, shared_mem: u32) -> Self {
        Self {
            threads_per_block: threads,
            registers_per_thread: registers,
            shared_memory_per_block: shared_mem,
        }
    }

    /// Builds a description for a kernel with `threads` threads per block,
    /// 32 registers per thread and no shared memory.
    #[must_use]
    pub const fn simple(threads: u32) -> Self {
        Self::new(threads, 32, 0)
    }
}
98
99#[derive(Debug, Clone, PartialEq)]
101pub struct OccupancyResult {
102 pub occupancy: f64,
104 pub active_warps_per_sm: u32,
106 pub max_warps_per_sm: u32,
108 pub active_blocks_per_sm: u32,
110 pub limiting_factor: OccupancyLimit,
112}
113
/// Resource that bounds how many blocks can be resident on one SM.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OccupancyLimit {
    /// The per-SM block limit is the bound.
    BlockCount,
    /// The per-SM thread (warp) capacity is the bound.
    ThreadCount,
    /// The per-SM register file is the bound.
    Registers,
    /// The per-SM shared memory is the bound.
    SharedMemory,
    /// No limiting resource.
    None,
}

impl std::fmt::Display for OccupancyLimit {
    /// Writes a short lowercase label for the variant.
    ///
    /// Uses `Formatter::pad` so width, fill and alignment flags such as
    /// `{:>14}` are honoured; plain `{}` output is the bare label.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::BlockCount => "block count",
            Self::ThreadCount => "thread count",
            Self::Registers => "register usage",
            Self::SharedMemory => "shared memory",
            Self::None => "none",
        };
        f.pad(label)
    }
}
140
141pub struct OccupancyCalculator;
143
144impl OccupancyCalculator {
145 #[allow(clippy::cast_precision_loss)]
147 #[must_use]
148 pub fn calculate(spec: &GpuSpec, kernel: &KernelResources) -> OccupancyResult {
149 if kernel.threads_per_block == 0 || spec.warp_size == 0 {
150 return OccupancyResult {
151 occupancy: 0.0,
152 active_warps_per_sm: 0,
153 max_warps_per_sm: 0,
154 active_blocks_per_sm: 0,
155 limiting_factor: OccupancyLimit::ThreadCount,
156 };
157 }
158
159 let max_warps = spec.max_threads_per_sm / spec.warp_size;
160 let warps_per_block = kernel.threads_per_block.div_ceil(spec.warp_size);
161
162 let blocks_by_count = spec.max_blocks_per_sm;
164
165 let blocks_by_threads = if warps_per_block > 0 {
167 max_warps / warps_per_block
168 } else {
169 0
170 };
171
172 let blocks_by_registers = if kernel.registers_per_thread > 0 {
174 let regs_per_block = kernel.registers_per_thread * kernel.threads_per_block;
175 if regs_per_block > 0 {
176 spec.max_registers_per_sm / regs_per_block
177 } else {
178 blocks_by_count
179 }
180 } else {
181 blocks_by_count
182 };
183
184 let blocks_by_shared = if kernel.shared_memory_per_block > 0 {
186 spec.max_shared_memory_per_sm / kernel.shared_memory_per_block
187 } else {
188 blocks_by_count
189 };
190
191 let active_blocks = blocks_by_count
193 .min(blocks_by_threads)
194 .min(blocks_by_registers)
195 .min(blocks_by_shared);
196
197 let limiting_factor = if active_blocks == 0 {
198 OccupancyLimit::ThreadCount
199 } else if active_blocks == blocks_by_shared && blocks_by_shared < blocks_by_count {
200 OccupancyLimit::SharedMemory
201 } else if active_blocks == blocks_by_registers && blocks_by_registers < blocks_by_count {
202 OccupancyLimit::Registers
203 } else if active_blocks == blocks_by_threads && blocks_by_threads < blocks_by_count {
204 OccupancyLimit::ThreadCount
205 } else if active_blocks == blocks_by_count {
206 OccupancyLimit::BlockCount
207 } else {
208 OccupancyLimit::None
209 };
210
211 let active_warps = active_blocks * warps_per_block;
212 let occupancy = if max_warps > 0 {
213 f64::from(active_warps) / f64::from(max_warps)
214 } else {
215 0.0
216 };
217 let occupancy = occupancy.min(1.0);
218
219 OccupancyResult {
220 occupancy,
221 active_warps_per_sm: active_warps.min(max_warps),
222 max_warps_per_sm: max_warps,
223 active_blocks_per_sm: active_blocks,
224 limiting_factor,
225 }
226 }
227
228 #[allow(clippy::cast_precision_loss)]
232 #[must_use]
233 pub fn find_optimal_block_size(
234 spec: &GpuSpec,
235 registers_per_thread: u32,
236 shared_memory_per_block: u32,
237 ) -> u32 {
238 let mut best_occupancy = 0.0_f64;
239 let mut best_size = spec.warp_size;
240
241 let max_threads = spec.max_threads_per_sm.min(1024);
242 let mut threads = spec.warp_size;
243
244 while threads <= max_threads {
245 let kernel =
246 KernelResources::new(threads, registers_per_thread, shared_memory_per_block);
247 let result = Self::calculate(spec, &kernel);
248 if result.occupancy > best_occupancy {
249 best_occupancy = result.occupancy;
250 best_size = threads;
251 }
252 threads += spec.warp_size;
253 }
254
255 best_size
256 }
257
258 #[allow(clippy::cast_precision_loss)]
260 #[must_use]
261 pub fn estimate_bandwidth(
262 occupancy: f64,
263 peak_bandwidth_gbps: f64,
264 memory_intensity: f64,
265 ) -> f64 {
266 let eff = occupancy.clamp(0.0, 1.0);
267 let intensity = memory_intensity.clamp(0.0, 1.0);
268 peak_bandwidth_gbps * eff * intensity
269 }
270}
271
/// A piece of tuning advice derived from an occupancy result.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PerformanceTip {
    /// Human-readable advice.
    pub message: String,
    /// Relative importance of the tip.
    ///
    /// NOTE(review): lower values appear to mean more urgent (1 is used for
    /// the most severe finding in this file) — confirm.
    pub priority: u32,
}

impl PerformanceTip {
    /// Creates a tip, copying `message` into an owned `String`.
    #[must_use]
    pub fn new(message: &str, priority: u32) -> Self {
        Self {
            message: message.to_owned(),
            priority,
        }
    }
}
291
292#[must_use]
294pub fn analyze_performance(result: &OccupancyResult) -> Vec<PerformanceTip> {
295 let mut tips = Vec::new();
296
297 if result.occupancy < 0.25 {
298 tips.push(PerformanceTip::new(
299 "Very low occupancy. Consider reducing resource usage per thread.",
300 1,
301 ));
302 } else if result.occupancy < 0.5 {
303 tips.push(PerformanceTip::new(
304 "Low occupancy. Adjusting block size or register usage may help.",
305 2,
306 ));
307 }
308
309 match result.limiting_factor {
310 OccupancyLimit::Registers => {
311 tips.push(PerformanceTip::new(
312 "Register usage is the bottleneck. Consider reducing local variables.",
313 2,
314 ));
315 }
316 OccupancyLimit::SharedMemory => {
317 tips.push(PerformanceTip::new(
318 "Shared memory is the bottleneck. Consider reducing shared memory usage.",
319 2,
320 ));
321 }
322 _ => {}
323 }
324
325 if result.occupancy >= 0.75 {
326 tips.push(PerformanceTip::new(
327 "Good occupancy. Focus on memory access patterns and instruction throughput.",
328 3,
329 ));
330 }
331
332 tips
333}
334
// Unit tests for the GPU presets, the occupancy calculator and the tip
// generator defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_spec_mid_range() {
        let spec = GpuSpec::mid_range();
        assert_eq!(spec.max_threads_per_sm, 1536);
        assert_eq!(spec.warp_size, 32);
    }

    #[test]
    fn test_gpu_spec_high_end() {
        let spec = GpuSpec::high_end();
        assert_eq!(spec.max_threads_per_sm, 2048);
        assert_eq!(spec.sm_count, 80);
    }

    #[test]
    fn test_gpu_spec_integrated() {
        let spec = GpuSpec::integrated();
        assert_eq!(spec.max_threads_per_sm, 512);
        assert_eq!(spec.sm_count, 8);
    }

    #[test]
    fn test_kernel_resources_simple() {
        let k = KernelResources::simple(256);
        assert_eq!(k.threads_per_block, 256);
        assert_eq!(k.registers_per_thread, 32);
        assert_eq!(k.shared_memory_per_block, 0);
    }

    #[test]
    fn test_occupancy_simple_kernel() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::simple(256);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy > 0.0);
        assert!(result.occupancy <= 1.0);
        assert!(result.active_warps_per_sm > 0);
    }

    #[test]
    fn test_occupancy_heavy_registers() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(256, 128, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy <= 1.0);
        // 256 threads * 128 registers = 32768 registers per block, so only
        // two blocks fit in the 65536-register file.
        assert_eq!(result.active_blocks_per_sm, 2);
        assert_eq!(result.limiting_factor, OccupancyLimit::Registers);
    }

    #[test]
    fn test_occupancy_heavy_shared_memory() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(256, 32, 32768);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy > 0.0);
        assert_eq!(result.limiting_factor, OccupancyLimit::SharedMemory);
    }

    #[test]
    fn test_occupancy_zero_threads() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(0, 32, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!((result.occupancy - 0.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_find_optimal_block_size() {
        let spec = GpuSpec::mid_range();
        let optimal = OccupancyCalculator::find_optimal_block_size(&spec, 32, 0);
        assert!(optimal >= spec.warp_size);
        assert!(optimal <= spec.max_threads_per_sm);
        assert_eq!(optimal % spec.warp_size, 0);
    }

    #[test]
    fn test_estimate_bandwidth() {
        let bw = OccupancyCalculator::estimate_bandwidth(1.0, 500.0, 1.0);
        assert!((bw - 500.0).abs() < f64::EPSILON);

        let bw2 = OccupancyCalculator::estimate_bandwidth(0.5, 500.0, 0.8);
        assert!((bw2 - 200.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_estimate_bandwidth_clamping() {
        let bw = OccupancyCalculator::estimate_bandwidth(2.0, 500.0, 1.5);
        assert!((bw - 500.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_performance_tips_low_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.1,
            active_warps_per_sm: 4,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 1,
            limiting_factor: OccupancyLimit::Registers,
        };
        let tips = analyze_performance(&result);
        assert!(!tips.is_empty());
        assert!(tips.iter().any(|t| t.message.contains("Very low")));
    }

    #[test]
    fn test_performance_tips_good_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.8,
            active_warps_per_sm: 38,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 6,
            limiting_factor: OccupancyLimit::None,
        };
        let tips = analyze_performance(&result);
        assert!(tips.iter().any(|t| t.message.contains("Good occupancy")));
    }

    #[test]
    fn test_occupancy_limit_display() {
        assert_eq!(format!("{}", OccupancyLimit::BlockCount), "block count");
        assert_eq!(format!("{}", OccupancyLimit::Registers), "register usage");
        assert_eq!(format!("{}", OccupancyLimit::SharedMemory), "shared memory");
    }

    #[test]
    fn test_performance_tip_creation() {
        let tip = PerformanceTip::new("test tip", 5);
        assert_eq!(tip.message, "test tip");
        assert_eq!(tip.priority, 5);
    }
}