Skip to main content

trueno_explain/
analyzer.rs

//! Core Analyzer trait and types for trueno-explain
//!
//! Implements the Toyota Way principle of Genchi Genbutsu (Go and See)
//! by making invisible compiler transformations visible.
5
6use crate::error::Result;
7use serde::{Deserialize, Serialize};
8
9/// Muda (waste) categories mapped to technical inefficiencies
10#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
11pub enum MudaType {
12    /// Muda of Transport: Register spills (moving data unnecessarily)
13    Transport,
14    /// Muda of Waiting: Uncoalesced memory access (stalls)
15    Waiting,
16    /// Muda of Overprocessing: Redundant instructions or excessive precision
17    Overprocessing,
18}
19
20/// A warning about detected waste (Muda) in generated code
21#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
22pub struct MudaWarning {
23    /// The category of waste detected
24    pub muda_type: MudaType,
25    /// Human-readable description of the issue
26    pub description: String,
27    /// Performance impact of the waste
28    pub impact: String,
29    /// Source line number if available
30    pub line: Option<usize>,
31    /// Suggested fix for the issue
32    pub suggestion: Option<String>,
33}
34
35/// Register usage statistics
36#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
37pub struct RegisterUsage {
38    /// Number of 32-bit floating point registers
39    pub f32_regs: u32,
40    /// Number of 64-bit floating point registers
41    pub f64_regs: u32,
42    /// Number of 32-bit integer/bit registers
43    pub b32_regs: u32,
44    /// Number of 64-bit integer/bit registers
45    pub b64_regs: u32,
46    /// Number of predicate (1-bit) registers
47    pub pred_regs: u32,
48}
49
50impl RegisterUsage {
51    /// Total register count
52    #[must_use]
53    pub fn total(&self) -> u32 {
54        self.f32_regs + self.f64_regs + self.b32_regs + self.b64_regs + self.pred_regs
55    }
56
57    /// Estimate occupancy based on register usage (simplified model)
58    /// SM 7.0+: 65536 registers per SM, max 255 per thread
59    #[must_use]
60    pub fn estimated_occupancy(&self) -> f32 {
61        let total = self.total();
62        if total == 0 {
63            return 1.0;
64        }
65        // Simplified: assume 2048 threads max per SM
66        // registers_per_thread * threads <= 65536
67        let max_threads = (65536 / total.max(1)).min(2048);
68        max_threads as f32 / 2048.0
69    }
70}
71
72/// Memory access pattern analysis
73#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
74pub struct MemoryPattern {
75    /// Number of global memory load operations
76    pub global_loads: u32,
77    /// Number of global memory store operations
78    pub global_stores: u32,
79    /// Number of shared memory load operations
80    pub shared_loads: u32,
81    /// Number of shared memory store operations
82    pub shared_stores: u32,
83    /// Ratio of coalesced memory accesses (0.0-1.0)
84    pub coalesced_ratio: f32,
85}
86
87/// Roofline model metrics for performance estimation
88#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
89pub struct RooflineMetric {
90    /// FLOPs per byte transferred (compute vs memory ratio)
91    pub arithmetic_intensity: f32,
92    /// Theoretical peak compute performance in GFLOPS
93    pub theoretical_peak_gflops: f32,
94    /// True if kernel is memory-bound, false if compute-bound
95    pub memory_bound: bool,
96}
97
98/// Complete analysis report for a kernel or function
99#[derive(Debug, Clone, Default, Serialize, Deserialize)]
100pub struct AnalysisReport {
101    /// Kernel or function name
102    pub name: String,
103    /// Target IR type (PTX, x86 ASM, WGSL)
104    pub target: String,
105    /// Register usage statistics
106    pub registers: RegisterUsage,
107    /// Memory access patterns
108    pub memory: MemoryPattern,
109    /// Roofline performance model
110    pub roofline: RooflineMetric,
111    /// Detected waste (Muda) warnings
112    pub warnings: Vec<MudaWarning>,
113    /// Total instruction count
114    pub instruction_count: u32,
115    /// Estimated GPU occupancy (0.0-1.0)
116    pub estimated_occupancy: f32,
117}
118
119/// Core trait for all analyzers (PTX, SIMD, WGSL)
120pub trait Analyzer {
121    /// The type of IR being analyzed (e.g., "PTX", "x86 ASM", "WGSL")
122    fn target_name(&self) -> &str;
123
124    /// Analyze the provided code and return a structured report
125    ///
126    /// # Errors
127    ///
128    /// Returns `ExplainError::PtxParseError` if the code cannot be parsed.
129    fn analyze(&self, code: &str) -> Result<AnalysisReport>;
130
131    /// Identify specific performance bottlenecks (Muda)
132    fn detect_muda(&self, code: &str) -> Vec<MudaWarning>;
133
134    /// Estimate theoretical peak performance
135    fn estimate_roofline(&self, analysis: &AnalysisReport) -> RooflineMetric;
136}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance for comparing occupancy estimates.
    const TOLERANCE: f32 = 0.01;

    /// Builds a usage record that only consumes `f32` registers.
    fn f32_only(count: u32) -> RegisterUsage {
        RegisterUsage {
            f32_regs: count,
            ..Default::default()
        }
    }

    /// All five register classes contribute to the total.
    #[test]
    fn test_register_usage_total() {
        let usage = RegisterUsage {
            f32_regs: 10,
            f64_regs: 5,
            b32_regs: 8,
            b64_regs: 4,
            pred_regs: 2,
        };
        let total = usage.total();
        assert_eq!(total, 29);
    }

    /// A default (all-zero) record has a total of zero.
    #[test]
    fn test_register_usage_total_empty() {
        assert_eq!(RegisterUsage::default().total(), 0);
    }

    /// 16 registers -> 65536/16 = 4096 threads, capped at 2048 -> 100%.
    #[test]
    fn test_occupancy_low_registers() {
        let occupancy = f32_only(16).estimated_occupancy();
        assert!((occupancy - 1.0).abs() < TOLERANCE);
    }

    /// 128 registers -> 65536/128 = 512 threads -> 512/2048 = 25%.
    #[test]
    fn test_occupancy_high_registers() {
        let occupancy = f32_only(128).estimated_occupancy();
        assert!((occupancy - 0.25).abs() < TOLERANCE);
    }

    /// Zero registers is treated as full occupancy.
    #[test]
    fn test_occupancy_zero_registers() {
        let occupancy = RegisterUsage::default().estimated_occupancy();
        assert!((occupancy - 1.0).abs() < TOLERANCE);
    }

    /// A warning survives a JSON round trip unchanged.
    #[test]
    fn test_muda_warning_serialization() {
        let original = MudaWarning {
            muda_type: MudaType::Transport,
            description: String::from("5 register spills detected"),
            impact: String::from("High latency local memory access"),
            line: Some(42),
            suggestion: Some(String::from("Reduce live variables")),
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: MudaWarning = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// A pretty-printed report contains its name and target strings.
    #[test]
    fn test_analysis_report_serialization() {
        let registers = RegisterUsage {
            f32_regs: 24,
            b32_regs: 18,
            ..Default::default()
        };
        let memory = MemoryPattern {
            global_loads: 100,
            coalesced_ratio: 0.95,
            ..Default::default()
        };
        let report = AnalysisReport {
            name: String::from("test_kernel"),
            target: String::from("PTX"),
            registers,
            memory,
            warnings: Vec::new(),
            instruction_count: 150,
            estimated_occupancy: 0.875,
            ..Default::default()
        };

        let pretty = serde_json::to_string_pretty(&report).unwrap();
        for needle in ["test_kernel", "PTX"] {
            assert!(pretty.contains(needle));
        }
    }
}