Skip to main content

oximedia_gpu/
occupancy.rs

1#![allow(dead_code)]
2//! GPU occupancy calculator and optimization.
3//!
4//! This module provides tools for estimating GPU occupancy based on
5//! workgroup size, register usage, and shared memory consumption.
6//! Higher occupancy generally means better latency hiding.
7
/// Typical GPU multiprocessor specification.
///
/// Describes the per-multiprocessor ("SM") resource limits used by the
/// occupancy calculation, plus the number of multiprocessors on the device.
/// The preset constructors return representative figures for a device class.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuSpec {
    /// Maximum threads per multiprocessor.
    pub max_threads_per_sm: u32,
    /// Maximum workgroups (blocks) per multiprocessor.
    pub max_blocks_per_sm: u32,
    /// Maximum shared memory per multiprocessor in bytes.
    pub max_shared_memory_per_sm: u32,
    /// Maximum registers per multiprocessor.
    pub max_registers_per_sm: u32,
    /// Warp size (wavefront size).
    pub warp_size: u32,
    /// Total number of multiprocessors on the device.
    pub sm_count: u32,
}

impl GpuSpec {
    /// Create a spec representing a mid-range desktop GPU.
    ///
    /// Usable in constant contexts.
    #[must_use]
    pub const fn mid_range() -> Self {
        Self {
            max_threads_per_sm: 1536,
            max_blocks_per_sm: 16,
            max_shared_memory_per_sm: 49152,
            max_registers_per_sm: 65536,
            warp_size: 32,
            sm_count: 30,
        }
    }

    /// Create a spec representing a high-end desktop GPU.
    ///
    /// Usable in constant contexts.
    #[must_use]
    pub const fn high_end() -> Self {
        Self {
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_shared_memory_per_sm: 102400,
            max_registers_per_sm: 65536,
            warp_size: 32,
            sm_count: 80,
        }
    }

    /// Create a spec representing integrated/mobile GPU.
    ///
    /// Usable in constant contexts.
    #[must_use]
    pub const fn integrated() -> Self {
        Self {
            max_threads_per_sm: 512,
            max_blocks_per_sm: 8,
            max_shared_memory_per_sm: 32768,
            max_registers_per_sm: 32768,
            warp_size: 32,
            sm_count: 8,
        }
    }
}
65
/// Kernel resource requirements.
///
/// Describes what a single workgroup (block) of a kernel consumes: its thread
/// count, the registers each thread uses, and the shared memory the block
/// allocates.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelResources {
    /// Threads per workgroup (block).
    pub threads_per_block: u32,
    /// Registers used per thread.
    pub registers_per_thread: u32,
    /// Shared memory used per block in bytes.
    pub shared_memory_per_block: u32,
}

impl KernelResources {
    /// Register count per thread assumed by [`Self::simple`].
    const TYPICAL_REGISTERS_PER_THREAD: u32 = 32;

    /// Create a new kernel resource descriptor.
    ///
    /// * `threads` - threads per workgroup (block).
    /// * `registers` - registers used per thread.
    /// * `shared_mem` - shared memory used per block, in bytes.
    ///
    /// Usable in constant contexts.
    #[must_use]
    pub const fn new(threads: u32, registers: u32, shared_mem: u32) -> Self {
        Self {
            threads_per_block: threads,
            registers_per_thread: registers,
            shared_memory_per_block: shared_mem,
        }
    }

    /// Create a simple kernel with typical register usage.
    ///
    /// The kernel uses 32 registers per thread and no shared memory.
    /// Usable in constant contexts.
    #[must_use]
    pub const fn simple(threads: u32) -> Self {
        Self::new(threads, Self::TYPICAL_REGISTERS_PER_THREAD, 0)
    }
}
98
/// Result of occupancy calculation.
///
/// Produced by `OccupancyCalculator::calculate`. Every figure describes a
/// single multiprocessor (SM). When the kernel has zero threads per block or
/// the spec has a zero warp size, all numeric fields are zero.
#[derive(Debug, Clone, PartialEq)]
pub struct OccupancyResult {
    /// Achieved occupancy ratio (0.0 to 1.0): active warps divided by the
    /// maximum warps per SM.
    pub occupancy: f64,
    /// Active warps per SM: resident blocks multiplied by the warps each
    /// block needs, never more than `max_warps_per_sm`.
    pub active_warps_per_sm: u32,
    /// Maximum possible warps per SM: the SM thread limit divided by the warp
    /// size, rounded down.
    pub max_warps_per_sm: u32,
    /// Active blocks per SM: the smallest of the per-resource block limits.
    pub active_blocks_per_sm: u32,
    /// Limiting factor for occupancy.
    pub limiting_factor: OccupancyLimit,
}
113
/// The resource that caps how many blocks can be resident on a multiprocessor.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OccupancyLimit {
    /// Limited by max blocks per SM.
    BlockCount,
    /// Limited by thread count.
    ThreadCount,
    /// Limited by register usage.
    Registers,
    /// Limited by shared memory.
    SharedMemory,
    /// No limiting factor (full occupancy).
    None,
}

/// Renders the limit as a short lowercase phrase for human-readable output.
impl std::fmt::Display for OccupancyLimit {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Choose the label first, then emit it with a single write.
        let label = match *self {
            OccupancyLimit::BlockCount => "block count",
            OccupancyLimit::ThreadCount => "thread count",
            OccupancyLimit::Registers => "register usage",
            OccupancyLimit::SharedMemory => "shared memory",
            OccupancyLimit::None => "none",
        };
        f.write_str(label)
    }
}
140
141/// GPU occupancy calculator.
142pub struct OccupancyCalculator;
143
144impl OccupancyCalculator {
145    /// Calculate occupancy for a given GPU spec and kernel configuration.
146    #[allow(clippy::cast_precision_loss)]
147    #[must_use]
148    pub fn calculate(spec: &GpuSpec, kernel: &KernelResources) -> OccupancyResult {
149        if kernel.threads_per_block == 0 || spec.warp_size == 0 {
150            return OccupancyResult {
151                occupancy: 0.0,
152                active_warps_per_sm: 0,
153                max_warps_per_sm: 0,
154                active_blocks_per_sm: 0,
155                limiting_factor: OccupancyLimit::ThreadCount,
156            };
157        }
158
159        let max_warps = spec.max_threads_per_sm / spec.warp_size;
160        let warps_per_block = kernel.threads_per_block.div_ceil(spec.warp_size);
161
162        // Limit by block count
163        let blocks_by_count = spec.max_blocks_per_sm;
164
165        // Limit by thread count
166        let blocks_by_threads = if warps_per_block > 0 {
167            max_warps / warps_per_block
168        } else {
169            0
170        };
171
172        // Limit by registers
173        let blocks_by_registers = if kernel.registers_per_thread > 0 {
174            let regs_per_block = kernel.registers_per_thread * kernel.threads_per_block;
175            if regs_per_block > 0 {
176                spec.max_registers_per_sm / regs_per_block
177            } else {
178                blocks_by_count
179            }
180        } else {
181            blocks_by_count
182        };
183
184        // Limit by shared memory
185        let blocks_by_shared = if kernel.shared_memory_per_block > 0 {
186            spec.max_shared_memory_per_sm / kernel.shared_memory_per_block
187        } else {
188            blocks_by_count
189        };
190
191        // Find limiting factor
192        let active_blocks = blocks_by_count
193            .min(blocks_by_threads)
194            .min(blocks_by_registers)
195            .min(blocks_by_shared);
196
197        let limiting_factor = if active_blocks == 0 {
198            OccupancyLimit::ThreadCount
199        } else if active_blocks == blocks_by_shared && blocks_by_shared < blocks_by_count {
200            OccupancyLimit::SharedMemory
201        } else if active_blocks == blocks_by_registers && blocks_by_registers < blocks_by_count {
202            OccupancyLimit::Registers
203        } else if active_blocks == blocks_by_threads && blocks_by_threads < blocks_by_count {
204            OccupancyLimit::ThreadCount
205        } else if active_blocks == blocks_by_count {
206            OccupancyLimit::BlockCount
207        } else {
208            OccupancyLimit::None
209        };
210
211        let active_warps = active_blocks * warps_per_block;
212        let occupancy = if max_warps > 0 {
213            f64::from(active_warps) / f64::from(max_warps)
214        } else {
215            0.0
216        };
217        let occupancy = occupancy.min(1.0);
218
219        OccupancyResult {
220            occupancy,
221            active_warps_per_sm: active_warps.min(max_warps),
222            max_warps_per_sm: max_warps,
223            active_blocks_per_sm: active_blocks,
224            limiting_factor,
225        }
226    }
227
228    /// Find the optimal workgroup size for best occupancy.
229    ///
230    /// Iterates over multiples of warp size up to max threads per block.
231    #[allow(clippy::cast_precision_loss)]
232    #[must_use]
233    pub fn find_optimal_block_size(
234        spec: &GpuSpec,
235        registers_per_thread: u32,
236        shared_memory_per_block: u32,
237    ) -> u32 {
238        let mut best_occupancy = 0.0_f64;
239        let mut best_size = spec.warp_size;
240
241        let max_threads = spec.max_threads_per_sm.min(1024);
242        let mut threads = spec.warp_size;
243
244        while threads <= max_threads {
245            let kernel =
246                KernelResources::new(threads, registers_per_thread, shared_memory_per_block);
247            let result = Self::calculate(spec, &kernel);
248            if result.occupancy > best_occupancy {
249                best_occupancy = result.occupancy;
250                best_size = threads;
251            }
252            threads += spec.warp_size;
253        }
254
255        best_size
256    }
257
258    /// Estimate achieved memory bandwidth given occupancy.
259    #[allow(clippy::cast_precision_loss)]
260    #[must_use]
261    pub fn estimate_bandwidth(
262        occupancy: f64,
263        peak_bandwidth_gbps: f64,
264        memory_intensity: f64,
265    ) -> f64 {
266        let eff = occupancy.clamp(0.0, 1.0);
267        let intensity = memory_intensity.clamp(0.0, 1.0);
268        peak_bandwidth_gbps * eff * intensity
269    }
270}
271
/// A single piece of tuning advice derived from an occupancy analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PerformanceTip {
    /// Human-readable tip.
    pub message: String,
    /// Priority (lower is more important).
    pub priority: u32,
}

impl PerformanceTip {
    /// Build a tip from a borrowed message and a priority.
    ///
    /// The message text is copied into an owned `String`.
    #[must_use]
    pub fn new(message: &str, priority: u32) -> Self {
        let message = String::from(message);
        Self { message, priority }
    }
}
291
292/// Generate performance tips based on occupancy results.
293#[must_use]
294pub fn analyze_performance(result: &OccupancyResult) -> Vec<PerformanceTip> {
295    let mut tips = Vec::new();
296
297    if result.occupancy < 0.25 {
298        tips.push(PerformanceTip::new(
299            "Very low occupancy. Consider reducing resource usage per thread.",
300            1,
301        ));
302    } else if result.occupancy < 0.5 {
303        tips.push(PerformanceTip::new(
304            "Low occupancy. Adjusting block size or register usage may help.",
305            2,
306        ));
307    }
308
309    match result.limiting_factor {
310        OccupancyLimit::Registers => {
311            tips.push(PerformanceTip::new(
312                "Register usage is the bottleneck. Consider reducing local variables.",
313                2,
314            ));
315        }
316        OccupancyLimit::SharedMemory => {
317            tips.push(PerformanceTip::new(
318                "Shared memory is the bottleneck. Consider reducing shared memory usage.",
319                2,
320            ));
321        }
322        _ => {}
323    }
324
325    if result.occupancy >= 0.75 {
326        tips.push(PerformanceTip::new(
327            "Good occupancy. Focus on memory access patterns and instruction throughput.",
328            3,
329        ));
330    }
331
332    tips
333}
334
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for floating-point comparisons.
    ///
    /// `f64::EPSILON` is smaller than the spacing between adjacent `f64`
    /// values near 200-500, so using it as a tolerance amounts to exact
    /// equality.
    const TOLERANCE: f64 = 1e-9;

    /// Returns `true` when `actual` is within [`TOLERANCE`] of `expected`.
    fn approx_eq(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < TOLERANCE
    }

    #[test]
    fn test_gpu_spec_mid_range() {
        let spec = GpuSpec::mid_range();
        assert_eq!(spec.max_threads_per_sm, 1536);
        assert_eq!(spec.warp_size, 32);
    }

    #[test]
    fn test_gpu_spec_high_end() {
        let spec = GpuSpec::high_end();
        assert_eq!(spec.max_threads_per_sm, 2048);
        assert_eq!(spec.sm_count, 80);
    }

    #[test]
    fn test_gpu_spec_integrated() {
        let spec = GpuSpec::integrated();
        assert_eq!(spec.max_threads_per_sm, 512);
        assert_eq!(spec.sm_count, 8);
    }

    #[test]
    fn test_kernel_resources_simple() {
        let k = KernelResources::simple(256);
        assert_eq!(k.threads_per_block, 256);
        assert_eq!(k.registers_per_thread, 32);
        assert_eq!(k.shared_memory_per_block, 0);
    }

    #[test]
    fn test_occupancy_simple_kernel() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::simple(256);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        // 1536 / 32 = 48 warp slots; 8 warps per block, so 6 blocks fill the SM.
        assert_eq!(result.max_warps_per_sm, 48);
        assert_eq!(result.active_blocks_per_sm, 6);
        assert_eq!(result.active_warps_per_sm, 48);
        assert!(approx_eq(result.occupancy, 1.0));
    }

    #[test]
    fn test_occupancy_heavy_registers() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(256, 128, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        // 256 * 128 = 32768 registers per block, so only 2 blocks fit in 65536.
        assert_eq!(result.active_blocks_per_sm, 2);
        assert_eq!(result.limiting_factor, OccupancyLimit::Registers);
        assert!(approx_eq(result.occupancy, 16.0 / 48.0));
    }

    #[test]
    fn test_occupancy_heavy_shared_memory() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(256, 32, 32768);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        // 49152 / 32768 leaves room for a single block.
        assert_eq!(result.active_blocks_per_sm, 1);
        assert!(result.occupancy > 0.0);
        assert_eq!(result.limiting_factor, OccupancyLimit::SharedMemory);
    }

    #[test]
    fn test_occupancy_zero_threads() {
        let spec = GpuSpec::mid_range();
        let kernel = KernelResources::new(0, 32, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(approx_eq(result.occupancy, 0.0));
        assert_eq!(result.active_blocks_per_sm, 0);
        assert_eq!(result.active_warps_per_sm, 0);
    }

    #[test]
    fn test_find_optimal_block_size() {
        let spec = GpuSpec::mid_range();
        let optimal = OccupancyCalculator::find_optimal_block_size(&spec, 32, 0);
        assert!(optimal >= spec.warp_size);
        assert!(optimal <= spec.max_threads_per_sm);
        assert_eq!(optimal % spec.warp_size, 0);

        // The chosen size must actually reach the best achievable occupancy.
        let kernel = KernelResources::new(optimal, 32, 0);
        let result = OccupancyCalculator::calculate(&spec, &kernel);
        assert!(approx_eq(result.occupancy, 1.0));
    }

    #[test]
    fn test_estimate_bandwidth() {
        let bw = OccupancyCalculator::estimate_bandwidth(1.0, 500.0, 1.0);
        assert!(approx_eq(bw, 500.0));

        let bw2 = OccupancyCalculator::estimate_bandwidth(0.5, 500.0, 0.8);
        assert!(approx_eq(bw2, 200.0));
    }

    #[test]
    fn test_estimate_bandwidth_clamping() {
        let bw = OccupancyCalculator::estimate_bandwidth(2.0, 500.0, 1.5);
        assert!(approx_eq(bw, 500.0));

        let negative = OccupancyCalculator::estimate_bandwidth(-1.0, 500.0, 1.0);
        assert!(approx_eq(negative, 0.0));
    }

    #[test]
    fn test_performance_tips_low_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.1,
            active_warps_per_sm: 4,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 1,
            limiting_factor: OccupancyLimit::Registers,
        };
        let tips = analyze_performance(&result);
        assert!(!tips.is_empty());
        assert!(tips.iter().any(|t| t.message.contains("Very low")));
        assert!(tips.iter().any(|t| t.message.contains("Register usage")));
    }

    #[test]
    fn test_performance_tips_good_occupancy() {
        let result = OccupancyResult {
            occupancy: 0.8,
            active_warps_per_sm: 38,
            max_warps_per_sm: 48,
            active_blocks_per_sm: 6,
            limiting_factor: OccupancyLimit::None,
        };
        let tips = analyze_performance(&result);
        assert!(tips.iter().any(|t| t.message.contains("Good occupancy")));
    }

    #[test]
    fn test_occupancy_limit_display() {
        assert_eq!(format!("{}", OccupancyLimit::BlockCount), "block count");
        assert_eq!(format!("{}", OccupancyLimit::ThreadCount), "thread count");
        assert_eq!(format!("{}", OccupancyLimit::Registers), "register usage");
        assert_eq!(format!("{}", OccupancyLimit::SharedMemory), "shared memory");
        assert_eq!(format!("{}", OccupancyLimit::None), "none");
    }

    #[test]
    fn test_performance_tip_creation() {
        let tip = PerformanceTip::new("test tip", 5);
        assert_eq!(tip.message, "test tip");
        assert_eq!(tip.priority, 5);
    }
}