Skip to main content

cgp/profilers/
simd.rs

1//! CPU SIMD profiling via perf stat + renacer + trueno-explain.
2//! Spec section 4.2.
3
4use anyhow::Result;
5use std::collections::HashMap;
6use std::process::Command;
7
/// Generic hardware events requested from `perf stat` on every SIMD profiling run.
///
/// The `cycles`/`instructions`, `cache-references`/`cache-misses` and
/// `branches`/`branch-misses` pairs are the inputs of the ratio helpers on
/// `PerfStatResult`; the remaining events are collected but not read back
/// anywhere in this file.
pub const SIMD_PERF_EVENTS: &'static [&'static str] = &[
    // Throughput (IPC).
    "cycles",
    "instructions",
    // Cache behaviour.
    "cache-references",
    "cache-misses",
    "L1-dcache-load-misses",
    "LLC-loads",
    // Branch prediction.
    "branches",
    "branch-misses",
];
19
/// Floating-point retirement events requested when profiling the `avx2` architecture.
///
/// Covers the scalar counter plus the 128-bit and 256-bit packed counters of
/// the `fp_arith_inst_retired.*_single` family, which is what
/// `PerfStatResult::simd_utilization` reads back.
pub const AVX2_EVENTS: &'static [&'static str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.128b_packed_single",
    "fp_arith_inst_retired.256b_packed_single",
];
26
/// Floating-point retirement events requested when profiling the `avx512` architecture.
///
/// Covers the scalar counter plus the 256-bit and 512-bit packed counters of
/// the `fp_arith_inst_retired.*_single` family. The 128-bit packed counter is
/// not part of this set.
pub const AVX512_EVENTS: &'static [&'static str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.256b_packed_single",
    "fp_arith_inst_retired.512b_packed_single",
];
32
/// Counter values and timing collected from one `perf stat` run.
#[derive(Debug, Clone, Default)]
pub struct PerfStatResult {
    /// Raw counter values keyed by the event name reported by perf.
    pub counters: HashMap<String, u64>,
    /// Elapsed wall-clock time in seconds; stays `0.0` when it was not recorded.
    pub wall_time_secs: f64,
}

impl PerfStatResult {
    /// Value of `event` as `f64`; an event that was not recorded counts as zero.
    fn counter_f64(&self, event: &str) -> f64 {
        self.counters.get(event).copied().unwrap_or(0) as f64
    }

    /// `part / whole * 100`, or `None` when `whole` is not positive.
    fn percentage(part: f64, whole: f64) -> Option<f64> {
        (whole > 0.0).then(|| part / whole * 100.0)
    }

    /// Instructions retired per cycle; `0.0` when no cycles were counted.
    pub fn ipc(&self) -> f64 {
        let cycle_count = self.counter_f64("cycles");
        let instruction_count = self.counter_f64("instructions");
        if cycle_count > 0.0 {
            instruction_count / cycle_count
        } else {
            0.0
        }
    }

    /// Cache misses as a percentage of cache references; `0.0` without references.
    pub fn cache_miss_rate(&self) -> f64 {
        let references = self.counter_f64("cache-references");
        let missed = self.counter_f64("cache-misses");
        Self::percentage(missed, references).unwrap_or(0.0)
    }

    /// Mispredicted branches as a percentage of all branches; `0.0` without branches.
    pub fn branch_miss_rate(&self) -> f64 {
        let total_branches = self.counter_f64("branches");
        let mispredicted = self.counter_f64("branch-misses");
        Self::percentage(mispredicted, total_branches).unwrap_or(0.0)
    }

    /// Share of packed (vector) single-precision FP instructions among all
    /// counted FP instructions, as a percentage.
    ///
    /// Returns `None` when neither scalar nor packed instructions were counted.
    pub fn simd_utilization(&self) -> Option<f64> {
        let scalar_ops = self.counter_f64("fp_arith_inst_retired.scalar_single");
        let [packed_128, packed_256, packed_512] = [
            "fp_arith_inst_retired.128b_packed_single",
            "fp_arith_inst_retired.256b_packed_single",
            "fp_arith_inst_retired.512b_packed_single",
        ]
        .map(|event| self.counter_f64(event));

        let vector_ops = packed_128 + packed_256 + packed_512;
        Self::percentage(vector_ops, scalar_ops + vector_ops)
    }
}
102
103/// Run perf stat and parse the output.
104pub fn run_perf_stat(binary: &str, args: &[&str], events: &[&str]) -> Result<PerfStatResult> {
105    let event_str = events.join(",");
106    let mut cmd = Command::new("perf");
107    cmd.arg("stat")
108        .arg("-e")
109        .arg(&event_str)
110        .arg("-x")
111        .arg(",") // CSV separator
112        .arg(binary)
113        .args(args);
114
115    let output = cmd.output()?;
116    let stderr = String::from_utf8_lossy(&output.stderr);
117
118    parse_perf_stat_csv(&stderr)
119}
120
121/// Parse perf stat CSV output (perf writes stats to stderr).
122/// Format: value,unit,event_name,... (with -x ,)
123pub fn parse_perf_stat_csv(output: &str) -> Result<PerfStatResult> {
124    let mut result = PerfStatResult::default();
125
126    for line in output.lines() {
127        let line = line.trim();
128        if line.is_empty() || line.starts_with('#') || line.starts_with("Performance") {
129            continue;
130        }
131
132        // Extract wall time from "X.YZ seconds time elapsed" lines
133        if line.contains("seconds time elapsed") {
134            if let Some(time_str) = line.split_whitespace().next() {
135                if let Ok(t) = time_str.parse::<f64>() {
136                    result.wall_time_secs = t;
137                }
138            }
139            continue;
140        }
141
142        // CSV format: value,unit,event-name,...
143        let fields: Vec<&str> = line.split(',').collect();
144        if fields.len() >= 3 {
145            let value_str = fields[0].trim().replace(' ', "");
146            let event_name = fields[2].trim();
147
148            if let Ok(value) = value_str.parse::<u64>() {
149                result.counters.insert(event_name.to_string(), value);
150            }
151        }
152    }
153
154    Ok(result)
155}
156
157/// Profile a SIMD function.
158pub fn profile_simd(function: &str, size: u32, arch: &str) -> Result<()> {
159    println!("\n=== CGP SIMD Profile: {function} (size={size}, arch={arch}) ===\n");
160
161    let Some(simd_events) = resolve_simd_events(arch)? else {
162        println!();
163        return Ok(());
164    };
165
166    if which::which("perf").is_err() {
167        print_perf_missing(function, arch);
168        println!();
169        return Ok(());
170    }
171
172    let Some(binary) = find_bench_binary() else {
173        println!("  No benchmark binary found.");
174        println!("  Build with: cargo build --release --bench vector_ops");
175        println!("  Then re-run cgp profile simd.");
176        println!();
177        return Ok(());
178    };
179
180    profile_with_perf(&binary, simd_events);
181    println!();
182    Ok(())
183}
184
185/// Return the SIMD perf event list for `arch`, or `None` when the arch is
186/// unsupported on the host (early-exit for caller). Warning printed to stdout.
187fn resolve_simd_events(arch: &str) -> Result<Option<&'static [&'static str]>> {
188    match arch {
189        "avx2" => {
190            #[cfg(target_arch = "x86_64")]
191            {
192                if !std::arch::is_x86_feature_detected!("avx2") {
193                    println!("  Warning: AVX2 not available on this CPU.");
194                }
195            }
196            Ok(Some(AVX2_EVENTS))
197        }
198        "avx512" => {
199            #[cfg(target_arch = "x86_64")]
200            {
201                if !std::arch::is_x86_feature_detected!("avx512f") {
202                    println!("  Warning: AVX-512 not available on this CPU.");
203                }
204            }
205            Ok(Some(AVX512_EVENTS))
206        }
207        "neon" => {
208            #[cfg(not(target_arch = "aarch64"))]
209            {
210                println!("  NEON not available -- use --cross-profile for QEMU-based analysis");
211                Ok(None)
212            }
213            #[cfg(target_arch = "aarch64")]
214            {
215                const NEON_EVENTS: &[&str] = &["INST_RETIRED", "CPU_CYCLES", "ASE_SPEC"];
216                Ok(Some(NEON_EVENTS))
217            }
218        }
219        "sse2" => {
220            const SSE2_EVENTS: &[&str] = &[
221                "fp_arith_inst_retired.scalar_single",
222                "fp_arith_inst_retired.128b_packed_single",
223            ];
224            Ok(Some(SSE2_EVENTS))
225        }
226        _ => {
227            anyhow::bail!("Unknown SIMD architecture: {arch}. Supported: avx2, avx512, neon, sse2")
228        }
229    }
230}
231
/// Print the fallback report used when the `perf` executable is not available:
/// an install hint followed by the requested function and architecture.
fn print_perf_missing(function: &str, arch: &str) {
    let notices = [
        "  perf not found. Install linux-tools-common for hardware counter profiling.",
        "  Showing static analysis only.",
    ];
    for notice in notices {
        println!("{notice}");
    }

    println!("\n  Function: {function}");
    println!("  Architecture: {arch}");
}
238
239fn profile_with_perf(binary: &str, simd_events: &[&str]) {
240    println!("  Backend: perf stat");
241    println!("  Binary: {binary}");
242
243    let mut all_events: Vec<&str> = SIMD_PERF_EVENTS.to_vec();
244    all_events.extend_from_slice(simd_events);
245
246    match run_perf_stat(binary, &[], &all_events) {
247        Ok(result) => {
248            warn_if_counters_blocked(&result);
249            print_hardware_counters(&result);
250            print_simd_utilization(&result);
251            if result.wall_time_secs > 0.0 {
252                println!("\n  Wall time: {:.3}s", result.wall_time_secs);
253            }
254        }
255        Err(e) => {
256            println!("  perf stat failed: {e}");
257            println!("  Try: sudo sysctl kernel.perf_event_paranoid=2");
258        }
259    }
260}
261
262/// Emit a warning when cycles counter is zero despite a non-empty counter set
263/// (typical sign of blocked perf paranoia).
264fn warn_if_counters_blocked(result: &PerfStatResult) {
265    let cycles = *result.counters.get("cycles").unwrap_or(&0);
266    if cycles != 0 || result.counters.is_empty() {
267        return;
268    }
269    let paranoid = std::fs::read_to_string("/proc/sys/kernel/perf_event_paranoid")
270        .ok()
271        .and_then(|s| s.trim().parse::<i32>().ok())
272        .unwrap_or(-1);
273    if paranoid > 2 {
274        println!(
275            "  \x1b[33m[WARN]\x1b[0m perf_event_paranoid={paranoid} — hardware counters blocked."
276        );
277        println!("  Fix: sudo sysctl kernel.perf_event_paranoid=2");
278        println!("  Or run: sudo cgp profile simd ...\n");
279    }
280}
281
282fn print_hardware_counters(result: &PerfStatResult) {
283    let cycles = *result.counters.get("cycles").unwrap_or(&0);
284    println!("\n  Hardware Counters:");
285    println!("    Cycles:       {:>14}", format_count(cycles));
286    println!(
287        "    Instructions: {:>14}",
288        format_count(*result.counters.get("instructions").unwrap_or(&0))
289    );
290    println!("    IPC:          {:>14.2}", result.ipc());
291    println!("    Cache miss:   {:>13.1}%", result.cache_miss_rate());
292    println!("    Branch miss:  {:>13.1}%", result.branch_miss_rate());
293}
294
295fn print_simd_utilization(result: &PerfStatResult) {
296    let Some(simd_pct) = result.simd_utilization() else {
297        return;
298    };
299    println!("\n  SIMD Utilization:");
300    println!("    Vector ops:    {simd_pct:.1}%");
301    println!("    Scalar ops:    {:.1}%", 100.0 - simd_pct);
302    if simd_pct < 50.0 {
303        println!("    [WARN] Low SIMD utilization — check for scalar fallbacks");
304    } else {
305        println!("    [OK] Good SIMD utilization");
306    }
307}
308
/// Render `n` in decimal with a comma between every group of three digits,
/// e.g. `1234567` becomes `1,234,567`.
fn format_count(n: u64) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);

    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes before every digit that starts a full group of three.
        let remaining = len - idx;
        if idx != 0 && remaining % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    grouped
}
321
/// Find a trueno benchmark binary to profile.
///
/// Search order:
/// 1. the `benchmark_matrix_suite` example under `$CARGO_TARGET_DIR` (when
///    set and non-empty), then under the fallback target directory, then
///    under `./target`;
/// 2. a `vector_ops-<hash>` bench executable in the `release/deps` directory
///    of `$CARGO_TARGET_DIR` when that is set, otherwise of the fallback
///    target directory and then `./target`.
///
/// Only regular files are accepted. When a deps directory holds several
/// matching bench executables (left over from earlier builds), the most
/// recently modified one is returned, so the choice no longer depends on
/// directory iteration order.
///
/// Returns `None` when nothing suitable is found.
fn find_bench_binary() -> Option<String> {
    // Machine-specific target directory kept for backward compatibility.
    const FALLBACK_TARGET_DIR: &str = "/mnt/nvme-raid0/targets/trueno";
    const EXAMPLE_REL_PATH: &str = "release/examples/benchmark_matrix_suite";

    // Check CARGO_TARGET_DIR first (user's zsh function sets this).
    let target_dir = std::env::var("CARGO_TARGET_DIR")
        .ok()
        .filter(|dir| !dir.is_empty());

    let mut candidates: Vec<String> = Vec::with_capacity(3);
    if let Some(dir) = &target_dir {
        candidates.push(format!("{dir}/{EXAMPLE_REL_PATH}"));
    }
    candidates.push(format!("{FALLBACK_TARGET_DIR}/{EXAMPLE_REL_PATH}"));
    candidates.push(format!("./target/{EXAMPLE_REL_PATH}"));

    if let Some(found) = candidates
        .into_iter()
        .find(|path| std::path::Path::new(path).is_file())
    {
        return Some(found);
    }

    // Fall back to bench executables produced by `cargo build --bench`.
    let deps_dirs: Vec<String> = match &target_dir {
        Some(dir) => vec![format!("{dir}/release/deps")],
        None => vec![
            format!("{FALLBACK_TARGET_DIR}/release/deps"),
            "./target/release/deps".to_string(),
        ],
    };

    for dir in &deps_dirs {
        let Ok(entries) = std::fs::read_dir(dir) else {
            continue;
        };

        let newest = entries
            .flatten()
            .filter(|entry| {
                let name = entry.file_name();
                let name = name.to_string_lossy();
                // Executables carry no extension; `.d`, `.rlib`, ... are skipped.
                name.starts_with("vector_ops-") && !name.contains('.')
            })
            .filter_map(|entry| {
                let path = entry.path();
                // `fs::metadata` follows symlinks, so a linked binary still counts.
                let meta = std::fs::metadata(&path).ok()?;
                if !meta.is_file() {
                    return None;
                }
                let modified = meta.modified().unwrap_or(std::time::SystemTime::UNIX_EPOCH);
                Some((modified, path))
            })
            // Newest first; the path breaks ties deterministically.
            .max();

        if let Some((_, path)) = newest {
            return Some(path.display().to_string());
        }
    }

    None
}
368
/// Unit tests for the perf-stat parser, the derived ratios and the formatting
/// helpers. None of them needs `perf` or a benchmark binary to be installed.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a result holding exactly the given counters.
    fn result_with(counters: &[(&str, u64)]) -> PerfStatResult {
        let mut result = PerfStatResult::default();
        for (name, value) in counters {
            result.counters.insert((*name).to_string(), *value);
        }
        result
    }

    #[test]
    fn test_simd_events_defined() {
        assert!(!SIMD_PERF_EVENTS.is_empty());
        assert!(!AVX2_EVENTS.is_empty());
        assert!(!AVX512_EVENTS.is_empty());
    }

    #[test]
    fn test_invalid_arch_rejected() {
        let result = profile_simd("test_fn", 1024, "invalid_arch");
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_perf_stat_csv() {
        let output = "1234567,,cycles,,,\n456789,,instructions,,,\n100,,cache-references,,,\n5,,cache-misses,,,\n";
        let result = parse_perf_stat_csv(output).unwrap();
        assert_eq!(*result.counters.get("cycles").unwrap(), 1234567);
        assert_eq!(*result.counters.get("instructions").unwrap(), 456789);
    }

    #[test]
    fn test_parse_skips_comments_and_unparsable_values() {
        let output = "# started on some date\n\n<not supported>,,cycles,0,100.00,,\n42,,instructions,10,100.00,,\n";
        let result = parse_perf_stat_csv(output).unwrap();
        assert!(result.counters.get("cycles").is_none());
        assert_eq!(*result.counters.get("instructions").unwrap(), 42);
    }

    #[test]
    fn test_parse_wall_time_line() {
        let result = parse_perf_stat_csv("       1.234 seconds time elapsed\n").unwrap();
        assert!((result.wall_time_secs - 1.234).abs() < 1e-9);
        assert!(result.counters.is_empty());
    }

    #[test]
    fn test_perf_stat_ipc() {
        let result = result_with(&[("cycles", 1000), ("instructions", 2000)]);
        assert!((result.ipc() - 2.0).abs() < 0.01);
    }

    #[test]
    fn test_perf_stat_cache_miss_rate() {
        let result = result_with(&[("cache-references", 1000), ("cache-misses", 50)]);
        assert!((result.cache_miss_rate() - 5.0).abs() < 0.01);
    }

    #[test]
    fn test_perf_stat_branch_miss_rate() {
        let result = result_with(&[("branches", 200), ("branch-misses", 10)]);
        assert!((result.branch_miss_rate() - 5.0).abs() < 0.01);
    }

    #[test]
    fn test_ratios_are_zero_without_counters() {
        let result = PerfStatResult::default();
        assert_eq!(result.ipc(), 0.0);
        assert_eq!(result.cache_miss_rate(), 0.0);
        assert_eq!(result.branch_miss_rate(), 0.0);
    }

    #[test]
    fn test_simd_utilization() {
        let result = result_with(&[
            ("fp_arith_inst_retired.scalar_single", 100),
            ("fp_arith_inst_retired.256b_packed_single", 900),
        ]);
        let util = result.simd_utilization().unwrap();
        assert!((util - 90.0).abs() < 0.01);
    }

    #[test]
    fn test_simd_utilization_none_without_fp_counters() {
        let result = result_with(&[("cycles", 1000)]);
        assert!(result.simd_utilization().is_none());
    }

    #[test]
    fn test_format_count() {
        assert_eq!(format_count(0), "0");
        assert_eq!(format_count(999), "999");
        assert_eq!(format_count(1000), "1,000");
        assert_eq!(format_count(1234567), "1,234,567");
    }
}