Skip to main content

cgp/profilers/
simd.rs

//! CPU SIMD profiling via perf stat + renacer + trueno-explain.
//! Spec section 4.2.
3
4use anyhow::Result;
5use std::collections::HashMap;
6use std::process::Command;
7
/// Architecture-independent `perf stat` events collected as the baseline for
/// SIMD analysis: cycle and instruction counts plus cache and branch statistics.
pub const SIMD_PERF_EVENTS: &[&str] = &[
    // Throughput (IPC is derived from these two).
    "cycles",
    "instructions",
    // Cache behaviour.
    "cache-references",
    "cache-misses",
    "L1-dcache-load-misses",
    "LLC-loads",
    // Branch prediction.
    "branches",
    "branch-misses",
];
19
/// Single-precision `fp_arith_inst_retired.*` events requested for the `avx2`
/// architecture: scalar, 128-bit packed and 256-bit packed operations.
pub const AVX2_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.128b_packed_single",
    "fp_arith_inst_retired.256b_packed_single",
];

/// Single-precision `fp_arith_inst_retired.*` events requested for the `avx512`
/// architecture: scalar, 256-bit packed and 512-bit packed operations.
pub const AVX512_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.256b_packed_single",
    "fp_arith_inst_retired.512b_packed_single",
];
32
/// Counters and timing captured from a single `perf stat` run.
#[derive(Debug, Clone, Default)]
pub struct PerfStatResult {
    /// Raw counter values keyed by perf event name.
    pub counters: HashMap<String, u64>,
    /// Elapsed wall-clock time in seconds (`0.0` by default).
    pub wall_time_secs: f64,
}

impl PerfStatResult {
    /// Value of the counter named `event` as `f64`; a missing counter reads as zero.
    fn counter(&self, event: &str) -> f64 {
        self.counters.get(event).copied().unwrap_or(0) as f64
    }

    /// Instructions per cycle, or `0.0` when no cycles were recorded.
    pub fn ipc(&self) -> f64 {
        let cycles = self.counter("cycles");
        if cycles > 0.0 {
            self.counter("instructions") / cycles
        } else {
            0.0
        }
    }

    /// Cache misses as a percentage of cache references (`0.0` without references).
    pub fn cache_miss_rate(&self) -> f64 {
        let references = self.counter("cache-references");
        if references > 0.0 {
            self.counter("cache-misses") / references * 100.0
        } else {
            0.0
        }
    }

    /// Mispredicted branches as a percentage of all branches (`0.0` without branches).
    pub fn branch_miss_rate(&self) -> f64 {
        let branches = self.counter("branches");
        if branches > 0.0 {
            self.counter("branch-misses") / branches * 100.0
        } else {
            0.0
        }
    }

    /// Share of single-precision FP operations that were packed (vector) rather
    /// than scalar, as a percentage: `vector / (vector + scalar) * 100`.
    ///
    /// Returns `None` when neither scalar nor packed operations were counted.
    pub fn simd_utilization(&self) -> Option<f64> {
        let scalar = self.counter("fp_arith_inst_retired.scalar_single");
        let packed_128 = self.counter("fp_arith_inst_retired.128b_packed_single");
        let packed_256 = self.counter("fp_arith_inst_retired.256b_packed_single");
        let packed_512 = self.counter("fp_arith_inst_retired.512b_packed_single");

        let vector = packed_128 + packed_256 + packed_512;
        let total = scalar + vector;
        (total > 0.0).then(|| vector / total * 100.0)
    }
}
102
103/// Run perf stat and parse the output.
104pub fn run_perf_stat(binary: &str, args: &[&str], events: &[&str]) -> Result<PerfStatResult> {
105    let event_str = events.join(",");
106    let mut cmd = Command::new("perf");
107    cmd.arg("stat")
108        .arg("-e")
109        .arg(&event_str)
110        .arg("-x")
111        .arg(",") // CSV separator
112        .arg(binary)
113        .args(args);
114
115    let output = cmd.output()?;
116    let stderr = String::from_utf8_lossy(&output.stderr);
117
118    parse_perf_stat_csv(&stderr)
119}
120
121/// Parse perf stat CSV output (perf writes stats to stderr).
122/// Format: value,unit,event_name,... (with -x ,)
123pub fn parse_perf_stat_csv(output: &str) -> Result<PerfStatResult> {
124    let mut result = PerfStatResult::default();
125
126    for line in output.lines() {
127        let line = line.trim();
128        if line.is_empty() || line.starts_with('#') || line.starts_with("Performance") {
129            continue;
130        }
131
132        // Extract wall time from "X.YZ seconds time elapsed" lines
133        if line.contains("seconds time elapsed") {
134            if let Some(time_str) = line.split_whitespace().next() {
135                if let Ok(t) = time_str.parse::<f64>() {
136                    result.wall_time_secs = t;
137                }
138            }
139            continue;
140        }
141
142        // CSV format: value,unit,event-name,...
143        let fields: Vec<&str> = line.split(',').collect();
144        if fields.len() >= 3 {
145            let value_str = fields[0].trim().replace(' ', "");
146            let event_name = fields[2].trim();
147
148            if let Ok(value) = value_str.parse::<u64>() {
149                result.counters.insert(event_name.to_string(), value);
150            }
151        }
152    }
153
154    Ok(result)
155}
156
157/// Profile a SIMD function.
158pub fn profile_simd(function: &str, size: u32, arch: &str) -> Result<()> {
159    println!("\n=== CGP SIMD Profile: {function} (size={size}, arch={arch}) ===\n");
160
161    // Validate architecture
162    let simd_events: &[&str] = match arch {
163        "avx2" => {
164            #[cfg(target_arch = "x86_64")]
165            {
166                if !std::arch::is_x86_feature_detected!("avx2") {
167                    println!("  Warning: AVX2 not available on this CPU.");
168                }
169            }
170            AVX2_EVENTS
171        }
172        "avx512" => {
173            #[cfg(target_arch = "x86_64")]
174            {
175                if !std::arch::is_x86_feature_detected!("avx512f") {
176                    println!("  Warning: AVX-512 not available on this CPU.");
177                }
178            }
179            AVX512_EVENTS
180        }
181        "neon" => {
182            #[cfg(not(target_arch = "aarch64"))]
183            {
184                println!("  NEON not available -- use --cross-profile for QEMU-based analysis");
185                return Ok(());
186            }
187            #[cfg(target_arch = "aarch64")]
188            {
189                &["INST_RETIRED", "CPU_CYCLES", "ASE_SPEC"][..]
190            }
191        }
192        "sse2" => &[
193            "fp_arith_inst_retired.scalar_single",
194            "fp_arith_inst_retired.128b_packed_single",
195        ][..],
196        _ => {
197            anyhow::bail!("Unknown SIMD architecture: {arch}. Supported: avx2, avx512, neon, sse2")
198        }
199    };
200
201    let has_perf = which::which("perf").is_ok();
202    if !has_perf {
203        println!("  perf not found. Install linux-tools-common for hardware counter profiling.");
204        println!("  Showing static analysis only.");
205        println!("\n  Function: {function}");
206        println!("  Architecture: {arch}");
207        return Ok(());
208    }
209
210    // Try to find a trueno benchmark binary
211    let bin_path = find_bench_binary();
212    match bin_path {
213        Some(binary) => {
214            println!("  Backend: perf stat");
215            println!("  Binary: {binary}");
216
217            // Collect base + SIMD counters
218            let mut all_events: Vec<&str> = SIMD_PERF_EVENTS.to_vec();
219            all_events.extend_from_slice(simd_events);
220
221            match run_perf_stat(&binary, &[], &all_events) {
222                Ok(result) => {
223                    // Check if counters are all zero (paranoid mode)
224                    let cycles = *result.counters.get("cycles").unwrap_or(&0);
225                    if cycles == 0 && !result.counters.is_empty() {
226                        let paranoid =
227                            std::fs::read_to_string("/proc/sys/kernel/perf_event_paranoid")
228                                .ok()
229                                .and_then(|s| s.trim().parse::<i32>().ok())
230                                .unwrap_or(-1);
231                        if paranoid > 2 {
232                            println!("  \x1b[33m[WARN]\x1b[0m perf_event_paranoid={paranoid} — hardware counters blocked.");
233                            println!("  Fix: sudo sysctl kernel.perf_event_paranoid=2");
234                            println!("  Or run: sudo cgp profile simd ...\n");
235                        }
236                    }
237
238                    println!("\n  Hardware Counters:");
239                    println!("    Cycles:       {:>14}", format_count(cycles));
240                    println!(
241                        "    Instructions: {:>14}",
242                        format_count(*result.counters.get("instructions").unwrap_or(&0))
243                    );
244                    println!("    IPC:          {:>14.2}", result.ipc());
245                    println!("    Cache miss:   {:>13.1}%", result.cache_miss_rate());
246                    println!("    Branch miss:  {:>13.1}%", result.branch_miss_rate());
247
248                    if let Some(simd_pct) = result.simd_utilization() {
249                        println!("\n  SIMD Utilization:");
250                        println!("    Vector ops:    {simd_pct:.1}%");
251                        println!("    Scalar ops:    {:.1}%", 100.0 - simd_pct);
252                        if simd_pct < 50.0 {
253                            println!(
254                                "    [WARN] Low SIMD utilization — check for scalar fallbacks"
255                            );
256                        } else {
257                            println!("    [OK] Good SIMD utilization");
258                        }
259                    }
260
261                    if result.wall_time_secs > 0.0 {
262                        println!("\n  Wall time: {:.3}s", result.wall_time_secs);
263                    }
264                }
265                Err(e) => {
266                    println!("  perf stat failed: {e}");
267                    println!("  Try: sudo sysctl kernel.perf_event_paranoid=2");
268                }
269            }
270        }
271        None => {
272            println!("  No benchmark binary found.");
273            println!("  Build with: cargo build --release --bench vector_ops");
274            println!("  Then re-run cgp profile simd.");
275        }
276    }
277
278    println!();
279    Ok(())
280}
281
/// Render `n` in decimal with a comma between every group of three digits,
/// counted from the right (e.g. `1234567` becomes `1,234,567`).
fn format_count(n: u64) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);
    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes before any digit that has a multiple of three
        // digits remaining to its right (itself included), except the first.
        if idx > 0 && (len - idx) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }
    grouped
}
294
/// Find a trueno benchmark binary to profile.
///
/// Search order:
/// 1. `release/examples/benchmark_matrix_suite` under `CARGO_TARGET_DIR` (when
///    set and non-empty), then under the two built-in target locations.
/// 2. A `vector_ops-*` bench artifact in `release/deps`, under
///    `CARGO_TARGET_DIR` if set, otherwise under the built-in locations.
///
/// Only regular files are accepted. When a `deps` directory holds several
/// `vector_ops-*` artifacts (for example from earlier builds), the most
/// recently modified one is returned; directory iteration order is not
/// specified, so taking the first match would pick an arbitrary build.
///
/// Returns `None` when nothing suitable exists.
fn find_bench_binary() -> Option<String> {
    // Check CARGO_TARGET_DIR first (user's zsh function sets this)
    let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
    let has_target_dir = !target_dir.is_empty();

    let mut candidates: Vec<String> = Vec::new();
    if has_target_dir {
        candidates.push(format!(
            "{target_dir}/release/examples/benchmark_matrix_suite"
        ));
    }
    candidates.push(
        "/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite".to_string(),
    );
    candidates.push("./target/release/examples/benchmark_matrix_suite".to_string());

    if let Some(found) = candidates
        .into_iter()
        .find(|path| std::path::Path::new(path).is_file())
    {
        return Some(found);
    }

    // Fall back to bench artifacts, which cargo names `<bench>-<hash>`.
    let deps_dirs = if has_target_dir {
        vec![format!("{target_dir}/release/deps")]
    } else {
        vec![
            "/mnt/nvme-raid0/targets/trueno/release/deps".to_string(),
            "./target/release/deps".to_string(),
        ]
    };

    for dir in &deps_dirs {
        if let Ok(entries) = std::fs::read_dir(dir) {
            let newest = entries
                .flatten()
                .filter(|entry| {
                    let file_name = entry.file_name();
                    let name = file_name.to_string_lossy();
                    // No '.' in the name: skips `.d`, `.rlib`, `.rmeta`, ... side files.
                    name.starts_with("vector_ops-")
                        && !name.contains('.')
                        && entry.path().is_file()
                })
                // Entries whose mtime cannot be read sort lowest (`None`).
                .max_by_key(|entry| entry.metadata().and_then(|meta| meta.modified()).ok());

            if let Some(entry) = newest {
                return Some(entry.path().display().to_string());
            }
        }
    }

    None
}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `PerfStatResult` holding exactly the given counters.
    fn result_with(counters: &[(&str, u64)]) -> PerfStatResult {
        let mut result = PerfStatResult::default();
        for &(event, value) in counters {
            result.counters.insert(event.to_string(), value);
        }
        result
    }

    /// Every event table must list at least one event.
    #[test]
    fn test_simd_events_defined() {
        for events in [SIMD_PERF_EVENTS, AVX2_EVENTS, AVX512_EVENTS] {
            assert!(!events.is_empty());
        }
    }

    /// An unsupported architecture name is reported as an error.
    #[test]
    fn test_invalid_arch_rejected() {
        assert!(profile_simd("test_fn", 1024, "invalid_arch").is_err());
    }

    /// Counter lines in perf's CSV format are picked up by event name.
    #[test]
    fn test_parse_perf_stat_csv() {
        let output = "1234567,,cycles,,,\n456789,,instructions,,,\n100,,cache-references,,,\n5,,cache-misses,,,\n";
        let parsed = parse_perf_stat_csv(output).unwrap();
        assert_eq!(parsed.counters["cycles"], 1234567);
        assert_eq!(parsed.counters["instructions"], 456789);
    }

    /// 2000 instructions over 1000 cycles is an IPC of 2.
    #[test]
    fn test_perf_stat_ipc() {
        let result = result_with(&[("cycles", 1000), ("instructions", 2000)]);
        assert!((result.ipc() - 2.0).abs() < 0.01);
    }

    /// 50 misses out of 1000 references is a 5% miss rate.
    #[test]
    fn test_perf_stat_cache_miss_rate() {
        let result = result_with(&[("cache-references", 1000), ("cache-misses", 50)]);
        assert!((result.cache_miss_rate() - 5.0).abs() < 0.01);
    }

    /// 900 packed ops against 100 scalar ops is 90% SIMD utilization.
    #[test]
    fn test_simd_utilization() {
        let result = result_with(&[
            ("fp_arith_inst_retired.scalar_single", 100),
            ("fp_arith_inst_retired.256b_packed_single", 900),
        ]);
        let util = result.simd_utilization().unwrap();
        assert!((util - 90.0).abs() < 0.01);
    }

    /// Separators appear only between complete groups of three digits.
    #[test]
    fn test_format_count() {
        let cases: [(u64, &str); 4] = [
            (0, "0"),
            (999, "999"),
            (1000, "1,000"),
            (1234567, "1,234,567"),
        ];
        for (input, expected) in cases {
            assert_eq!(format_count(input), expected);
        }
    }
}