Skip to main content

cgp/analysis/
muda.rs

1//! Muda (Waste) Detection Engine — Seven categories of GPU compute waste.
2//! Mapped from Toyota Production System (Ohno, 1988) [7].
3
4use serde::{Deserialize, Serialize};
5
6/// Seven Muda of GPU Compute, plus CPU-specific waste categories.
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub enum GpuMuda {
9    /// Muda of Transport: Data moved unnecessarily.
10    /// Examples: register spills, redundant L2 traffic, unnecessary H2D copies.
11    Transport {
12        register_spills: u64,
13        unnecessary_global_loads: u64,
14        redundant_shared_stores: u64,
15    },
16
17    /// Muda of Waiting: Hardware resources idle.
18    /// Examples: barrier stalls, memory latency not hidden, pipeline bubbles.
19    Waiting {
20        barrier_stall_cycles: u64,
21        memory_stall_cycles: u64,
22        pipeline_bubbles: u64,
23        warp_scheduler_idle_pct: f64,
24    },
25
26    /// Muda of Overprocessing: More work than necessary.
27    /// Examples: FP32 when FP16 suffices, unneeded boundary checks, redundant instructions.
28    Overprocessing {
29        precision_waste_pct: f64,
30        redundant_instructions: u64,
31        unnecessary_bounds_checks: u64,
32    },
33
34    /// Muda of Inventory: Resources allocated but unused.
35    /// Examples: shared memory allocated but not used, registers reserved but unused.
36    Inventory {
37        unused_shared_memory_bytes: u64,
38        unused_registers_per_thread: u32,
39        occupancy_loss_pct: f64,
40    },
41
42    /// Muda of Motion: Excessive control flow.
43    /// Examples: warp divergence, branch overhead, loop overhead.
44    Motion {
45        divergent_branches: u64,
46        branch_efficiency_pct: f64,
47        loop_overhead_cycles: u64,
48    },
49
50    /// Muda of Defects: Incorrect results requiring rework.
51    /// Examples: NaN propagation, precision loss, numerical instability.
52    Defects {
53        nan_count: u64,
54        inf_count: u64,
55        precision_loss_bits: f64,
56    },
57
58    /// Muda of Overproduction: Computing results that aren't needed.
59    /// Examples: padding waste, inactive threads in partial tiles.
60    Overproduction {
61        padding_waste_pct: f64,
62        inactive_thread_pct: f64,
63        unused_output_elements: u64,
64    },
65}
66
/// A single detected waste finding: the measurements, an impact estimate and
/// advice on how to address it.
///
/// Values of this type are produced by the `MudaDetector::detect_*` methods,
/// which return `None` instead when nothing is flagged.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MudaDetection {
    /// The waste category together with the raw measurements that were passed
    /// to the detector.
    pub muda: GpuMuda,
    /// Impact: estimated percentage of execution time wasted.
    ///
    /// This is a per-category heuristic computed by the detector; the type
    /// itself does not enforce any range on the value.
    pub impact_pct: f64,
    /// Human-readable description of the waste.
    pub description: String,
    /// Actionable recommendation.
    pub recommendation: String,
}
78
79/// Analyze a kernel profile for all seven Muda categories.
80#[derive(Default)]
81pub struct MudaDetector {
82    /// Thresholds for each waste category.
83    pub thresholds: MudaThresholds,
84}
85
86/// Configurable thresholds for Muda detection.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct MudaThresholds {
89    /// Register spills > this count triggers Transport muda.
90    pub max_register_spills: u64,
91    /// Warp scheduler idle > this percentage triggers Waiting muda.
92    pub max_scheduler_idle_pct: f64,
93    /// Branch efficiency < this percentage triggers Motion muda.
94    pub min_branch_efficiency_pct: f64,
95    /// Occupancy loss > this percentage triggers Inventory muda.
96    pub max_occupancy_loss_pct: f64,
97    /// Padding waste > this percentage triggers Overproduction muda.
98    pub max_padding_waste_pct: f64,
99    /// Any NaN/Inf triggers Defects muda.
100    pub max_nan_inf_count: u64,
101    /// Precision waste > this percentage triggers Overprocessing muda.
102    pub max_precision_waste_pct: f64,
103}
104
105impl Default for MudaThresholds {
106    fn default() -> Self {
107        Self {
108            max_register_spills: 0,
109            max_scheduler_idle_pct: 20.0,
110            min_branch_efficiency_pct: 90.0,
111            max_occupancy_loss_pct: 50.0,
112            max_padding_waste_pct: 10.0,
113            max_nan_inf_count: 0,
114            max_precision_waste_pct: 25.0,
115        }
116    }
117}
118
119impl MudaDetector {
120    pub fn new() -> Self {
121        Self {
122            thresholds: MudaThresholds::default(),
123        }
124    }
125
126    pub fn with_thresholds(thresholds: MudaThresholds) -> Self {
127        Self { thresholds }
128    }
129
130    /// Detect Transport muda from register spill and memory traffic data.
131    pub fn detect_transport(
132        &self,
133        register_spills: u64,
134        unnecessary_global_loads: u64,
135        redundant_shared_stores: u64,
136    ) -> Option<MudaDetection> {
137        if register_spills > self.thresholds.max_register_spills
138            || unnecessary_global_loads > 0
139            || redundant_shared_stores > 0
140        {
141            let total_waste = register_spills + unnecessary_global_loads + redundant_shared_stores;
142            Some(MudaDetection {
143                muda: GpuMuda::Transport {
144                    register_spills,
145                    unnecessary_global_loads,
146                    redundant_shared_stores,
147                },
148                impact_pct: (total_waste as f64).min(100.0),
149                description: format!(
150                    "Data movement waste: {register_spills} register spills, \
151                     {unnecessary_global_loads} unnecessary global loads, \
152                     {redundant_shared_stores} redundant shared stores"
153                ),
154                recommendation: if register_spills > 0 {
155                    "Reduce register pressure: decrease tile size, use shared memory, or reduce live variables".to_string()
156                } else {
157                    "Review memory access patterns for redundant loads/stores".to_string()
158                },
159            })
160        } else {
161            None
162        }
163    }
164
165    /// Detect Waiting muda from stall cycle data.
166    pub fn detect_waiting(
167        &self,
168        barrier_stall_cycles: u64,
169        memory_stall_cycles: u64,
170        pipeline_bubbles: u64,
171        warp_scheduler_idle_pct: f64,
172    ) -> Option<MudaDetection> {
173        if warp_scheduler_idle_pct > self.thresholds.max_scheduler_idle_pct
174            || barrier_stall_cycles > 0
175            || memory_stall_cycles > 0
176        {
177            let impact =
178                warp_scheduler_idle_pct.max(if memory_stall_cycles > 0 { 10.0 } else { 0.0 });
179            Some(MudaDetection {
180                muda: GpuMuda::Waiting {
181                    barrier_stall_cycles,
182                    memory_stall_cycles,
183                    pipeline_bubbles,
184                    warp_scheduler_idle_pct,
185                },
186                impact_pct: impact,
187                description: format!(
188                    "Hardware idle: scheduler {warp_scheduler_idle_pct:.1}% idle, \
189                     {memory_stall_cycles} memory stall cycles, \
190                     {barrier_stall_cycles} barrier stall cycles"
191                ),
192                recommendation: if memory_stall_cycles > barrier_stall_cycles {
193                    "Increase warps per SM for latency hiding, or improve data locality".to_string()
194                } else {
195                    "Reduce barrier synchronization or overlap compute with data movement"
196                        .to_string()
197                },
198            })
199        } else {
200            None
201        }
202    }
203
204    /// Detect Motion muda from branch divergence data.
205    pub fn detect_motion(
206        &self,
207        divergent_branches: u64,
208        branch_efficiency_pct: f64,
209        loop_overhead_cycles: u64,
210    ) -> Option<MudaDetection> {
211        if branch_efficiency_pct < self.thresholds.min_branch_efficiency_pct
212            || divergent_branches > 0
213        {
214            Some(MudaDetection {
215                muda: GpuMuda::Motion {
216                    divergent_branches,
217                    branch_efficiency_pct,
218                    loop_overhead_cycles,
219                },
220                impact_pct: 100.0 - branch_efficiency_pct,
221                description: format!(
222                    "Control flow waste: {divergent_branches} divergent branches, \
223                     {branch_efficiency_pct:.1}% branch efficiency"
224                ),
225                recommendation:
226                    "Ensure warp-uniform branching; move data-dependent branches outside warp"
227                        .to_string(),
228            })
229        } else {
230            None
231        }
232    }
233
234    /// Detect Inventory muda from resource allocation data.
235    pub fn detect_inventory(
236        &self,
237        unused_shared_memory_bytes: u64,
238        unused_registers_per_thread: u32,
239        occupancy_loss_pct: f64,
240    ) -> Option<MudaDetection> {
241        if occupancy_loss_pct > self.thresholds.max_occupancy_loss_pct
242            || unused_shared_memory_bytes > 0
243            || unused_registers_per_thread > 0
244        {
245            Some(MudaDetection {
246                muda: GpuMuda::Inventory {
247                    unused_shared_memory_bytes,
248                    unused_registers_per_thread,
249                    occupancy_loss_pct,
250                },
251                impact_pct: occupancy_loss_pct,
252                description: format!(
253                    "Resource waste: {unused_shared_memory_bytes} bytes unused smem, \
254                     {unused_registers_per_thread} unused regs/thread, \
255                     {occupancy_loss_pct:.1}% occupancy loss"
256                ),
257                recommendation: "Reduce shared memory or register allocation to improve occupancy"
258                    .to_string(),
259            })
260        } else {
261            None
262        }
263    }
264
265    /// Detect Defects muda from numerical error data.
266    pub fn detect_defects(
267        &self,
268        nan_count: u64,
269        inf_count: u64,
270        precision_loss_bits: f64,
271    ) -> Option<MudaDetection> {
272        if nan_count > self.thresholds.max_nan_inf_count
273            || inf_count > self.thresholds.max_nan_inf_count
274            || precision_loss_bits > 1.0
275        {
276            Some(MudaDetection {
277                muda: GpuMuda::Defects {
278                    nan_count,
279                    inf_count,
280                    precision_loss_bits,
281                },
282                impact_pct: if nan_count > 0 || inf_count > 0 {
283                    100.0
284                } else {
285                    precision_loss_bits * 10.0
286                },
287                description: format!(
288                    "Numerical defects: {nan_count} NaN, {inf_count} Inf, \
289                     {precision_loss_bits:.1} bits precision loss"
290                ),
291                recommendation: if nan_count > 0 {
292                    "Investigate NaN source: likely division by zero or log(negative)".to_string()
293                } else {
294                    "Consider using higher precision for accumulation".to_string()
295                },
296            })
297        } else {
298            None
299        }
300    }
301
302    /// Detect Overproduction muda from padding/inactive thread data.
303    pub fn detect_overproduction(
304        &self,
305        padding_waste_pct: f64,
306        inactive_thread_pct: f64,
307        unused_output_elements: u64,
308    ) -> Option<MudaDetection> {
309        if padding_waste_pct > self.thresholds.max_padding_waste_pct
310            || inactive_thread_pct > self.thresholds.max_padding_waste_pct
311        {
312            Some(MudaDetection {
313                muda: GpuMuda::Overproduction {
314                    padding_waste_pct,
315                    inactive_thread_pct,
316                    unused_output_elements,
317                },
318                impact_pct: padding_waste_pct.max(inactive_thread_pct),
319                description: format!(
320                    "Overproduction: {padding_waste_pct:.1}% padding waste, \
321                     {inactive_thread_pct:.1}% inactive threads"
322                ),
323                recommendation: "Adjust tile size to match problem dimensions; use predication for partial tiles".to_string(),
324            })
325        } else {
326            None
327        }
328    }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Spills above the default limit of zero are reported as Transport muda.
    #[test]
    fn test_detect_register_spills() {
        let found = MudaDetector::new().detect_transport(5, 0, 0);
        assert!(found.is_some());

        let reports_five_spills = matches!(
            found.unwrap().muda,
            GpuMuda::Transport {
                register_spills: 5,
                ..
            }
        );
        assert!(reports_five_spills);
    }

    /// All-zero transport counters produce no detection.
    #[test]
    fn test_no_transport_waste() {
        let found = MudaDetector::new().detect_transport(0, 0, 0);
        assert!(found.is_none());
    }

    /// Divergent branches with low efficiency are reported as Motion muda.
    #[test]
    fn test_detect_warp_divergence() {
        let found = MudaDetector::new().detect_motion(10, 75.0, 100);
        assert!(found.is_some());

        let reports_ten_branches = matches!(
            found.unwrap().muda,
            GpuMuda::Motion {
                divergent_branches: 10,
                ..
            }
        );
        assert!(reports_ten_branches);
    }

    /// Any NaN is treated as full impact.
    #[test]
    fn test_detect_nan_defects() {
        let impact = MudaDetector::new()
            .detect_defects(3, 0, 0.0)
            .map(|detection| detection.impact_pct);
        assert!(impact.is_some());
        assert_eq!(impact.unwrap(), 100.0);
    }

    /// Sub-bit precision loss without NaN/Inf is not a defect.
    #[test]
    fn test_no_defects_clean() {
        let found = MudaDetector::new().detect_defects(0, 0, 0.5);
        assert!(found.is_none());
    }

    /// Impact is the larger of padding waste and inactive-thread percentage.
    #[test]
    fn test_detect_overproduction() {
        let impact = MudaDetector::new()
            .detect_overproduction(25.0, 15.0, 1024)
            .map(|detection| detection.impact_pct);
        assert!(impact.is_some());
        assert_eq!(impact.unwrap(), 25.0);
    }

    /// A raised spill limit suppresses detection until it is exceeded.
    #[test]
    fn test_custom_thresholds() {
        let relaxed = MudaThresholds {
            max_register_spills: 10,
            ..Default::default()
        };
        let detector = MudaDetector::with_thresholds(relaxed);

        // 5 spills should NOT trigger with threshold 10
        let below_limit = detector.detect_transport(5, 0, 0);
        assert!(below_limit.is_none());

        // 11 spills should trigger
        let above_limit = detector.detect_transport(11, 0, 0);
        assert!(above_limit.is_some());
    }
}