1use serde::{Deserialize, Serialize};
5
/// The seven classic "muda" (waste) categories from lean manufacturing,
/// reinterpreted for GPU kernel execution. Each variant carries the raw
/// counters / percentages that evidence that kind of waste.
///
/// NOTE(review): units are assumed from field names (cycles, bytes, percent
/// in 0.0–100.0) — confirm against whatever profiler feeds these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum GpuMuda {
    /// Wasteful data movement: register spills and avoidable
    /// global/shared-memory traffic.
    Transport {
        register_spills: u64,
        unnecessary_global_loads: u64,
        redundant_shared_stores: u64,
    },

    /// Hardware sitting idle: stalls, pipeline bubbles, and idle
    /// warp-scheduler time.
    Waiting {
        barrier_stall_cycles: u64,
        memory_stall_cycles: u64,
        pipeline_bubbles: u64,
        warp_scheduler_idle_pct: f64,
    },

    /// Doing more work than the result requires: excess precision,
    /// redundant instructions, avoidable bounds checks.
    Overprocessing {
        precision_waste_pct: f64,
        redundant_instructions: u64,
        unnecessary_bounds_checks: u64,
    },

    /// Allocated-but-unused resources (shared memory, registers) that
    /// depress occupancy.
    Inventory {
        unused_shared_memory_bytes: u64,
        unused_registers_per_thread: u32,
        occupancy_loss_pct: f64,
    },

    /// Wasted control-flow motion: warp divergence and loop overhead.
    Motion {
        divergent_branches: u64,
        branch_efficiency_pct: f64,
        loop_overhead_cycles: u64,
    },

    /// Numerical defects in the output: NaN/Inf values and precision loss.
    Defects {
        nan_count: u64,
        inf_count: u64,
        precision_loss_bits: f64,
    },

    /// Producing output nobody consumes: padding, inactive threads,
    /// unused output elements.
    Overproduction {
        padding_waste_pct: f64,
        inactive_thread_pct: f64,
        unused_output_elements: u64,
    },
}
66
/// A single waste finding: the categorized evidence plus an estimated
/// impact and a human-readable remediation suggestion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MudaDetection {
    /// The waste category and the measurements that evidence it.
    pub muda: GpuMuda,
    /// Estimated performance impact. Intended as a percentage; most
    /// detectors keep it in 0.0–100.0, but it is not clamped everywhere
    /// (e.g. the precision-loss path multiplies bits by 10).
    pub impact_pct: f64,
    /// Summary of the measurements behind this finding.
    pub description: String,
    /// Suggested fix for this category of waste.
    pub recommendation: String,
}
78
/// Scans profiling measurements for the GPU waste categories in `GpuMuda`,
/// reporting a `MudaDetection` whenever a measurement exceeds the
/// configured `MudaThresholds`.
#[derive(Default)]
pub struct MudaDetector {
    /// Tolerances below which a measurement is not reported as waste.
    pub thresholds: MudaThresholds,
}
85
/// Per-category tolerances used by `MudaDetector`. A measurement at or
/// below its threshold (above, for `min_*`) is considered acceptable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MudaThresholds {
    /// Register spills tolerated before Transport waste is reported.
    pub max_register_spills: u64,
    /// Warp-scheduler idle percentage tolerated before Waiting waste is reported.
    pub max_scheduler_idle_pct: f64,
    /// Branch efficiency below this percentage is reported as Motion waste.
    pub min_branch_efficiency_pct: f64,
    /// Occupancy loss percentage tolerated before Inventory waste is reported.
    pub max_occupancy_loss_pct: f64,
    /// Padding/inactive-thread percentage tolerated before Overproduction
    /// waste is reported.
    pub max_padding_waste_pct: f64,
    /// NaN/Inf count tolerated before Defects waste is reported.
    pub max_nan_inf_count: u64,
    /// Precision-waste percentage tolerated (Overprocessing category).
    pub max_precision_waste_pct: f64,
}
104
105impl Default for MudaThresholds {
106 fn default() -> Self {
107 Self {
108 max_register_spills: 0,
109 max_scheduler_idle_pct: 20.0,
110 min_branch_efficiency_pct: 90.0,
111 max_occupancy_loss_pct: 50.0,
112 max_padding_waste_pct: 10.0,
113 max_nan_inf_count: 0,
114 max_precision_waste_pct: 25.0,
115 }
116 }
117}
118
119impl MudaDetector {
120 pub fn new() -> Self {
121 Self {
122 thresholds: MudaThresholds::default(),
123 }
124 }
125
126 pub fn with_thresholds(thresholds: MudaThresholds) -> Self {
127 Self { thresholds }
128 }
129
130 pub fn detect_transport(
132 &self,
133 register_spills: u64,
134 unnecessary_global_loads: u64,
135 redundant_shared_stores: u64,
136 ) -> Option<MudaDetection> {
137 if register_spills > self.thresholds.max_register_spills
138 || unnecessary_global_loads > 0
139 || redundant_shared_stores > 0
140 {
141 let total_waste = register_spills + unnecessary_global_loads + redundant_shared_stores;
142 Some(MudaDetection {
143 muda: GpuMuda::Transport {
144 register_spills,
145 unnecessary_global_loads,
146 redundant_shared_stores,
147 },
148 impact_pct: (total_waste as f64).min(100.0),
149 description: format!(
150 "Data movement waste: {register_spills} register spills, \
151 {unnecessary_global_loads} unnecessary global loads, \
152 {redundant_shared_stores} redundant shared stores"
153 ),
154 recommendation: if register_spills > 0 {
155 "Reduce register pressure: decrease tile size, use shared memory, or reduce live variables".to_string()
156 } else {
157 "Review memory access patterns for redundant loads/stores".to_string()
158 },
159 })
160 } else {
161 None
162 }
163 }
164
165 pub fn detect_waiting(
167 &self,
168 barrier_stall_cycles: u64,
169 memory_stall_cycles: u64,
170 pipeline_bubbles: u64,
171 warp_scheduler_idle_pct: f64,
172 ) -> Option<MudaDetection> {
173 if warp_scheduler_idle_pct > self.thresholds.max_scheduler_idle_pct
174 || barrier_stall_cycles > 0
175 || memory_stall_cycles > 0
176 {
177 let impact =
178 warp_scheduler_idle_pct.max(if memory_stall_cycles > 0 { 10.0 } else { 0.0 });
179 Some(MudaDetection {
180 muda: GpuMuda::Waiting {
181 barrier_stall_cycles,
182 memory_stall_cycles,
183 pipeline_bubbles,
184 warp_scheduler_idle_pct,
185 },
186 impact_pct: impact,
187 description: format!(
188 "Hardware idle: scheduler {warp_scheduler_idle_pct:.1}% idle, \
189 {memory_stall_cycles} memory stall cycles, \
190 {barrier_stall_cycles} barrier stall cycles"
191 ),
192 recommendation: if memory_stall_cycles > barrier_stall_cycles {
193 "Increase warps per SM for latency hiding, or improve data locality".to_string()
194 } else {
195 "Reduce barrier synchronization or overlap compute with data movement"
196 .to_string()
197 },
198 })
199 } else {
200 None
201 }
202 }
203
204 pub fn detect_motion(
206 &self,
207 divergent_branches: u64,
208 branch_efficiency_pct: f64,
209 loop_overhead_cycles: u64,
210 ) -> Option<MudaDetection> {
211 if branch_efficiency_pct < self.thresholds.min_branch_efficiency_pct
212 || divergent_branches > 0
213 {
214 Some(MudaDetection {
215 muda: GpuMuda::Motion {
216 divergent_branches,
217 branch_efficiency_pct,
218 loop_overhead_cycles,
219 },
220 impact_pct: 100.0 - branch_efficiency_pct,
221 description: format!(
222 "Control flow waste: {divergent_branches} divergent branches, \
223 {branch_efficiency_pct:.1}% branch efficiency"
224 ),
225 recommendation:
226 "Ensure warp-uniform branching; move data-dependent branches outside warp"
227 .to_string(),
228 })
229 } else {
230 None
231 }
232 }
233
234 pub fn detect_inventory(
236 &self,
237 unused_shared_memory_bytes: u64,
238 unused_registers_per_thread: u32,
239 occupancy_loss_pct: f64,
240 ) -> Option<MudaDetection> {
241 if occupancy_loss_pct > self.thresholds.max_occupancy_loss_pct
242 || unused_shared_memory_bytes > 0
243 || unused_registers_per_thread > 0
244 {
245 Some(MudaDetection {
246 muda: GpuMuda::Inventory {
247 unused_shared_memory_bytes,
248 unused_registers_per_thread,
249 occupancy_loss_pct,
250 },
251 impact_pct: occupancy_loss_pct,
252 description: format!(
253 "Resource waste: {unused_shared_memory_bytes} bytes unused smem, \
254 {unused_registers_per_thread} unused regs/thread, \
255 {occupancy_loss_pct:.1}% occupancy loss"
256 ),
257 recommendation: "Reduce shared memory or register allocation to improve occupancy"
258 .to_string(),
259 })
260 } else {
261 None
262 }
263 }
264
265 pub fn detect_defects(
267 &self,
268 nan_count: u64,
269 inf_count: u64,
270 precision_loss_bits: f64,
271 ) -> Option<MudaDetection> {
272 if nan_count > self.thresholds.max_nan_inf_count
273 || inf_count > self.thresholds.max_nan_inf_count
274 || precision_loss_bits > 1.0
275 {
276 Some(MudaDetection {
277 muda: GpuMuda::Defects {
278 nan_count,
279 inf_count,
280 precision_loss_bits,
281 },
282 impact_pct: if nan_count > 0 || inf_count > 0 {
283 100.0
284 } else {
285 precision_loss_bits * 10.0
286 },
287 description: format!(
288 "Numerical defects: {nan_count} NaN, {inf_count} Inf, \
289 {precision_loss_bits:.1} bits precision loss"
290 ),
291 recommendation: if nan_count > 0 {
292 "Investigate NaN source: likely division by zero or log(negative)".to_string()
293 } else {
294 "Consider using higher precision for accumulation".to_string()
295 },
296 })
297 } else {
298 None
299 }
300 }
301
302 pub fn detect_overproduction(
304 &self,
305 padding_waste_pct: f64,
306 inactive_thread_pct: f64,
307 unused_output_elements: u64,
308 ) -> Option<MudaDetection> {
309 if padding_waste_pct > self.thresholds.max_padding_waste_pct
310 || inactive_thread_pct > self.thresholds.max_padding_waste_pct
311 {
312 Some(MudaDetection {
313 muda: GpuMuda::Overproduction {
314 padding_waste_pct,
315 inactive_thread_pct,
316 unused_output_elements,
317 },
318 impact_pct: padding_waste_pct.max(inactive_thread_pct),
319 description: format!(
320 "Overproduction: {padding_waste_pct:.1}% padding waste, \
321 {inactive_thread_pct:.1}% inactive threads"
322 ),
323 recommendation: "Adjust tile size to match problem dimensions; use predication for partial tiles".to_string(),
324 })
325 } else {
326 None
327 }
328 }
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    // Any register spill exceeds the default zero-tolerance threshold.
    #[test]
    fn test_detect_register_spills() {
        let detector = MudaDetector::new();
        let result = detector.detect_transport(5, 0, 0);
        assert!(result.is_some());
        let detection = result.unwrap();
        assert!(matches!(
            detection.muda,
            GpuMuda::Transport {
                register_spills: 5,
                ..
            }
        ));
    }

    // All-zero transport counters produce no finding.
    #[test]
    fn test_no_transport_waste() {
        let detector = MudaDetector::new();
        let result = detector.detect_transport(0, 0, 0);
        assert!(result.is_none());
    }

    // 75% branch efficiency is below the default 90% floor, and the
    // divergent-branch count is carried through into the finding.
    #[test]
    fn test_detect_warp_divergence() {
        let detector = MudaDetector::new();
        let result = detector.detect_motion(10, 75.0, 100);
        assert!(result.is_some());
        let detection = result.unwrap();
        assert!(matches!(
            detection.muda,
            GpuMuda::Motion {
                divergent_branches: 10,
                ..
            }
        ));
    }

    // Any NaN is scored at maximum (100%) impact.
    #[test]
    fn test_detect_nan_defects() {
        let detector = MudaDetector::new();
        let result = detector.detect_defects(3, 0, 0.0);
        assert!(result.is_some());
        assert_eq!(result.unwrap().impact_pct, 100.0);
    }

    // Precision loss of half a bit is under the 1.0-bit trigger: no finding.
    #[test]
    fn test_no_defects_clean() {
        let detector = MudaDetector::new();
        let result = detector.detect_defects(0, 0, 0.5);
        assert!(result.is_none());
    }

    // Impact is the larger of padding waste and inactive-thread percentage.
    #[test]
    fn test_detect_overproduction() {
        let detector = MudaDetector::new();
        let result = detector.detect_overproduction(25.0, 15.0, 1024);
        assert!(result.is_some());
        assert_eq!(result.unwrap().impact_pct, 25.0);
    }

    // Raising max_register_spills to 10 tolerates 5 spills but not 11.
    #[test]
    fn test_custom_thresholds() {
        let thresholds = MudaThresholds {
            max_register_spills: 10,
            ..Default::default()
        };
        let detector = MudaDetector::with_thresholds(thresholds);
        let result = detector.detect_transport(5, 0, 0);
        assert!(result.is_none());
        let result = detector.detect_transport(11, 0, 0);
        assert!(result.is_some());
    }
}
409}