Skip to main content

oximedia_gpu/
occupancy.rs

1#![allow(dead_code)]
2//! GPU occupancy calculator and optimization.
3//!
4//! This module provides tools for estimating GPU occupancy based on
5//! workgroup size, register usage, and shared memory consumption.
6//! Higher occupancy generally means better latency hiding.
7
/// Resource limits of a single GPU multiprocessor (SM), plus the device's SM count.
#[derive(Debug, Clone, PartialEq)]
pub struct GpuSpec {
    /// Upper bound on resident threads per multiprocessor.
    pub max_threads_per_sm: u32,
    /// Upper bound on resident workgroups (blocks) per multiprocessor.
    pub max_blocks_per_sm: u32,
    /// Shared memory available per multiprocessor, in bytes.
    pub max_shared_memory_per_sm: u32,
    /// Registers available per multiprocessor.
    pub max_registers_per_sm: u32,
    /// Number of threads in a warp (wavefront).
    pub warp_size: u32,
    /// Number of multiprocessors on the device.
    pub sm_count: u32,
}

impl GpuSpec {
    /// Warp width used by every built-in preset.
    const PRESET_WARP_SIZE: u32 = 32;

    /// Preset describing a mid-range desktop GPU.
    #[must_use]
    pub fn mid_range() -> Self {
        GpuSpec {
            sm_count: 30,
            warp_size: Self::PRESET_WARP_SIZE,
            max_blocks_per_sm: 16,
            max_threads_per_sm: 1_536,
            max_registers_per_sm: 65_536,
            max_shared_memory_per_sm: 49_152,
        }
    }

    /// Preset describing a high-end desktop GPU.
    #[must_use]
    pub fn high_end() -> Self {
        GpuSpec {
            sm_count: 80,
            warp_size: Self::PRESET_WARP_SIZE,
            max_blocks_per_sm: 32,
            max_threads_per_sm: 2_048,
            max_registers_per_sm: 65_536,
            max_shared_memory_per_sm: 102_400,
        }
    }

    /// Preset describing an integrated/mobile GPU.
    #[must_use]
    pub fn integrated() -> Self {
        GpuSpec {
            sm_count: 8,
            warp_size: Self::PRESET_WARP_SIZE,
            max_blocks_per_sm: 8,
            max_threads_per_sm: 512,
            max_registers_per_sm: 32_768,
            max_shared_memory_per_sm: 32_768,
        }
    }
}
65
/// Per-block resource footprint of a kernel.
#[derive(Debug, Clone, PartialEq)]
pub struct KernelResources {
    /// Number of threads in one workgroup (block).
    pub threads_per_block: u32,
    /// Number of registers each thread uses.
    pub registers_per_thread: u32,
    /// Bytes of shared memory each block uses.
    pub shared_memory_per_block: u32,
}

impl KernelResources {
    /// Build a descriptor from explicit thread, register and shared-memory figures.
    #[must_use]
    pub fn new(threads: u32, registers: u32, shared_mem: u32) -> Self {
        KernelResources {
            shared_memory_per_block: shared_mem,
            registers_per_thread: registers,
            threads_per_block: threads,
        }
    }

    /// Build a descriptor for a kernel using 32 registers per thread and no
    /// shared memory.
    #[must_use]
    pub fn simple(threads: u32) -> Self {
        Self::new(threads, 32, 0)
    }
}
98
99/// Result of occupancy calculation.
100#[derive(Debug, Clone, PartialEq)]
101pub struct OccupancyResult {
102    /// Achieved occupancy ratio (0.0 to 1.0).
103    pub occupancy: f64,
104    /// Active warps per SM.
105    pub active_warps_per_sm: u32,
106    /// Maximum possible warps per SM.
107    pub max_warps_per_sm: u32,
108    /// Active blocks per SM.
109    pub active_blocks_per_sm: u32,
110    /// Limiting factor for occupancy.
111    pub limiting_factor: OccupancyLimit,
112}
113
/// Resource that caps how many blocks can be resident on a multiprocessor.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OccupancyLimit {
    /// The per-SM block cap is the constraint.
    BlockCount,
    /// The per-SM thread (warp) capacity is the constraint.
    ThreadCount,
    /// Register usage is the constraint.
    Registers,
    /// Shared memory usage is the constraint.
    SharedMemory,
    /// Nothing constrains occupancy (full occupancy).
    None,
}

impl std::fmt::Display for OccupancyLimit {
    /// Writes a short lowercase label for the variant.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            OccupancyLimit::BlockCount => "block count",
            OccupancyLimit::ThreadCount => "thread count",
            OccupancyLimit::Registers => "register usage",
            OccupancyLimit::SharedMemory => "shared memory",
            OccupancyLimit::None => "none",
        };
        // `write_str` (not `pad`) so width/alignment flags are ignored, as before.
        f.write_str(label)
    }
}
140
141/// GPU occupancy calculator.
142pub struct OccupancyCalculator;
143
144impl OccupancyCalculator {
145    /// Calculate occupancy for a given GPU spec and kernel configuration.
146    #[allow(clippy::cast_precision_loss)]
147    #[allow(clippy::manual_checked_ops)]
148    #[must_use]
149    pub fn calculate(spec: &GpuSpec, kernel: &KernelResources) -> OccupancyResult {
150        if kernel.threads_per_block == 0 || spec.warp_size == 0 {
151            return OccupancyResult {
152                occupancy: 0.0,
153                active_warps_per_sm: 0,
154                max_warps_per_sm: 0,
155                active_blocks_per_sm: 0,
156                limiting_factor: OccupancyLimit::ThreadCount,
157            };
158        }
159
160        let max_warps = spec.max_threads_per_sm / spec.warp_size;
161        let warps_per_block = kernel.threads_per_block.div_ceil(spec.warp_size);
162
163        // Limit by block count
164        let blocks_by_count = spec.max_blocks_per_sm;
165
166        // Limit by thread count
167        let blocks_by_threads = max_warps.checked_div(warps_per_block).unwrap_or(0);
168
169        // Limit by registers
170        let blocks_by_registers = if kernel.registers_per_thread > 0 {
171            let regs_per_block = kernel.registers_per_thread * kernel.threads_per_block;
172            spec.max_registers_per_sm
173                .checked_div(regs_per_block)
174                .unwrap_or(blocks_by_count)
175        } else {
176            blocks_by_count
177        };
178
179        // Limit by shared memory
180        let blocks_by_shared = spec
181            .max_shared_memory_per_sm
182            .checked_div(kernel.shared_memory_per_block)
183            .unwrap_or(blocks_by_count);
184
185        // Find limiting factor
186        let active_blocks = blocks_by_count
187            .min(blocks_by_threads)
188            .min(blocks_by_registers)
189            .min(blocks_by_shared);
190
191        let limiting_factor = if active_blocks == 0 {
192            OccupancyLimit::ThreadCount
193        } else if active_blocks == blocks_by_shared && blocks_by_shared < blocks_by_count {
194            OccupancyLimit::SharedMemory
195        } else if active_blocks == blocks_by_registers && blocks_by_registers < blocks_by_count {
196            OccupancyLimit::Registers
197        } else if active_blocks == blocks_by_threads && blocks_by_threads < blocks_by_count {
198            OccupancyLimit::ThreadCount
199        } else if active_blocks == blocks_by_count {
200            OccupancyLimit::BlockCount
201        } else {
202            OccupancyLimit::None
203        };
204
205        let active_warps = active_blocks * warps_per_block;
206        let occupancy = if max_warps > 0 {
207            f64::from(active_warps) / f64::from(max_warps)
208        } else {
209            0.0
210        };
211        let occupancy = occupancy.min(1.0);
212
213        OccupancyResult {
214            occupancy,
215            active_warps_per_sm: active_warps.min(max_warps),
216            max_warps_per_sm: max_warps,
217            active_blocks_per_sm: active_blocks,
218            limiting_factor,
219        }
220    }
221
222    /// Find the optimal workgroup size for best occupancy.
223    ///
224    /// Iterates over multiples of warp size up to max threads per block.
225    #[allow(clippy::cast_precision_loss)]
226    #[must_use]
227    pub fn find_optimal_block_size(
228        spec: &GpuSpec,
229        registers_per_thread: u32,
230        shared_memory_per_block: u32,
231    ) -> u32 {
232        let mut best_occupancy = 0.0_f64;
233        let mut best_size = spec.warp_size;
234
235        let max_threads = spec.max_threads_per_sm.min(1024);
236        let mut threads = spec.warp_size;
237
238        while threads <= max_threads {
239            let kernel =
240                KernelResources::new(threads, registers_per_thread, shared_memory_per_block);
241            let result = Self::calculate(spec, &kernel);
242            if result.occupancy > best_occupancy {
243                best_occupancy = result.occupancy;
244                best_size = threads;
245            }
246            threads += spec.warp_size;
247        }
248
249        best_size
250    }
251
252    /// Estimate achieved memory bandwidth given occupancy.
253    #[allow(clippy::cast_precision_loss)]
254    #[must_use]
255    pub fn estimate_bandwidth(
256        occupancy: f64,
257        peak_bandwidth_gbps: f64,
258        memory_intensity: f64,
259    ) -> f64 {
260        let eff = occupancy.clamp(0.0, 1.0);
261        let intensity = memory_intensity.clamp(0.0, 1.0);
262        peak_bandwidth_gbps * eff * intensity
263    }
264}
265
/// A single piece of tuning advice derived from an occupancy analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PerformanceTip {
    /// Text of the tip, meant for human readers.
    pub message: String,
    /// Importance rank; smaller numbers are more important.
    pub priority: u32,
}

impl PerformanceTip {
    /// Build a tip by copying `message` into an owned string.
    #[must_use]
    pub fn new(message: &str, priority: u32) -> Self {
        let message = String::from(message);
        PerformanceTip { message, priority }
    }
}
285
286/// Generate performance tips based on occupancy results.
287#[must_use]
288pub fn analyze_performance(result: &OccupancyResult) -> Vec<PerformanceTip> {
289    let mut tips = Vec::new();
290
291    if result.occupancy < 0.25 {
292        tips.push(PerformanceTip::new(
293            "Very low occupancy. Consider reducing resource usage per thread.",
294            1,
295        ));
296    } else if result.occupancy < 0.5 {
297        tips.push(PerformanceTip::new(
298            "Low occupancy. Adjusting block size or register usage may help.",
299            2,
300        ));
301    }
302
303    match result.limiting_factor {
304        OccupancyLimit::Registers => {
305            tips.push(PerformanceTip::new(
306                "Register usage is the bottleneck. Consider reducing local variables.",
307                2,
308            ));
309        }
310        OccupancyLimit::SharedMemory => {
311            tips.push(PerformanceTip::new(
312                "Shared memory is the bottleneck. Consider reducing shared memory usage.",
313                2,
314            ));
315        }
316        _ => {}
317    }
318
319    if result.occupancy >= 0.75 {
320        tips.push(PerformanceTip::new(
321            "Good occupancy. Focus on memory access patterns and instruction throughput.",
322            3,
323        ));
324    }
325
326    tips
327}
328
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for floating-point comparisons. `f64::EPSILON` is
    /// the spacing of doubles near 1.0, so using it on values in the hundreds
    /// amounts to exact equality.
    const TOLERANCE: f64 = 1e-9;

    #[test]
    fn test_gpu_spec_mid_range() {
        let spec = GpuSpec::mid_range();
        assert_eq!(spec.max_threads_per_sm, 1536);
        assert_eq!(spec.warp_size, 32);
    }

    #[test]
    fn test_gpu_spec_high_end() {
        let spec = GpuSpec::high_end();
        assert_eq!(spec.max_threads_per_sm, 2048);
        assert_eq!(spec.sm_count, 80);
    }

    #[test]
    fn test_gpu_spec_integrated() {
        let spec = GpuSpec::integrated();
        assert_eq!(spec.max_threads_per_sm, 512);
        assert_eq!(spec.sm_count, 8);
    }

    #[test]
    fn test_kernel_resources_simple() {
        let k = KernelResources::simple(256);
        assert_eq!(k.threads_per_block, 256);
        assert_eq!(k.registers_per_thread, 32);
        assert_eq!(k.shared_memory_per_block, 0);
    }

    #[test]
    fn test_occupancy_simple_kernel() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::simple(256);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy > 0.0);
        assert!(result.occupancy <= 1.0);
        assert!(result.active_warps_per_sm > 0);
    }

    #[test]
    fn test_occupancy_heavy_registers() {
        let spec = GpuSpec::mid_range();
        let baseline = OccupancyCalculator::calculate(&spec, &KernelResources::simple(256));
        let heavy = OccupancyCalculator::calculate(&spec, &KernelResources::new(256, 128, 0));
        // 128 regs * 256 threads = 32768 per block, so only 2 blocks fit in
        // the 65536-register file and registers become the limit.
        assert_eq!(heavy.limiting_factor, OccupancyLimit::Registers);
        assert_eq!(heavy.active_blocks_per_sm, 2);
        assert!(heavy.occupancy < baseline.occupancy);
        assert!(heavy.occupancy <= 1.0);
    }

    #[test]
    fn test_occupancy_heavy_shared_memory() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(256, 32, 32768);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy > 0.0);
        assert_eq!(result.limiting_factor, OccupancyLimit::SharedMemory);
    }

    #[test]
    fn test_occupancy_zero_threads() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(0, 32, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(result.occupancy.abs() < TOLERANCE);
        assert_eq!(result.active_blocks_per_sm, 0);
        assert_eq!(result.active_warps_per_sm, 0);
    }

    #[test]
    fn test_find_optimal_block_size() {
        let spec = GpuSpec::mid_range();
        let optimal = OccupancyCalculator::find_optimal_block_size(&spec, 32, 0);
        assert!(optimal >= spec.warp_size);
        assert!(optimal <= spec.max_threads_per_sm);
        assert_eq!(optimal % spec.warp_size, 0);
    }

    #[test]
    fn test_estimate_bandwidth() {
        let bw = OccupancyCalculator::estimate_bandwidth(1.0, 500.0, 1.0);
        assert!((bw - 500.0).abs() < TOLERANCE);

        let bw2 = OccupancyCalculator::estimate_bandwidth(0.5, 500.0, 0.8);
        assert!((bw2 - 200.0).abs() < TOLERANCE);
    }

    #[test]
    fn test_estimate_bandwidth_clamping() {
        let bw = OccupancyCalculator::estimate_bandwidth(2.0, 500.0, 1.5);
        assert!((bw - 500.0).abs() < TOLERANCE);
    }

    #[test]
    fn test_performance_tips_low_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.1,
            active_warps_per_sm: 4,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 1,
            limiting_factor: OccupancyLimit::Registers,
        };
        let tips = analyze_performance(&result);
        assert!(!tips.is_empty());
        assert!(tips.iter().any(|t| t.message.contains("Very low")));
    }

    #[test]
    fn test_performance_tips_good_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.8,
            active_warps_per_sm: 38,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 6,
            limiting_factor: OccupancyLimit::None,
        };
        let tips = analyze_performance(&result);
        assert!(tips.iter().any(|t| t.message.contains("Good occupancy")));
    }

    #[test]
    fn test_occupancy_limit_display() {
        assert_eq!(OccupancyLimit::BlockCount.to_string(), "block count");
        assert_eq!(OccupancyLimit::ThreadCount.to_string(), "thread count");
        assert_eq!(OccupancyLimit::Registers.to_string(), "register usage");
        assert_eq!(OccupancyLimit::SharedMemory.to_string(), "shared memory");
        assert_eq!(OccupancyLimit::None.to_string(), "none");
    }

    #[test]
    fn test_performance_tip_creation() {
        let tip = PerformanceTip::new("test tip", 5);
        assert_eq!(tip.message, "test tip");
        assert_eq!(tip.priority, 5);
    }
}