1use anyhow::Result;
5use std::collections::HashMap;
6use std::process::Command;
7
/// Generic `perf` event names that are requested for every profiling run,
/// independent of the SIMD instruction set being analysed.
///
/// `profile_with_perf` prepends these to the architecture-specific event list,
/// and the derived metrics on `PerfStatResult` (IPC, cache and branch miss
/// rates) look their counters up by exactly these names.
pub const SIMD_PERF_EVENTS: &[&str] = &[
    // Feed `PerfStatResult::ipc`.
    "cycles",
    "instructions",
    // Feed `PerfStatResult::cache_miss_rate`.
    "cache-references",
    "cache-misses",
    "L1-dcache-load-misses",
    "LLC-loads",
    // Feed `PerfStatResult::branch_miss_rate`.
    "branches",
    "branch-misses",
];
19
/// Floating-point retirement events requested when profiling with
/// `arch = "avx2"`: scalar, 128-bit packed and 256-bit packed single-precision.
///
/// `PerfStatResult::simd_utilization` reads these counters by name.
pub const AVX2_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.128b_packed_single",
    "fp_arith_inst_retired.256b_packed_single",
];
26
/// Floating-point retirement events requested when profiling with
/// `arch = "avx512"`: scalar, 256-bit packed and 512-bit packed single-precision.
///
/// Note that the 128-bit packed counter is not part of this list.
pub const AVX512_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.256b_packed_single",
    "fp_arith_inst_retired.512b_packed_single",
];
32
/// Counter values collected from one `perf stat` run.
#[derive(Debug, Clone, Default)]
pub struct PerfStatResult {
    /// Raw counter values keyed by the event name reported by perf.
    pub counters: HashMap<String, u64>,
    /// Elapsed wall-clock time in seconds; `0.0` when no value was recorded.
    pub wall_time_secs: f64,
}

impl PerfStatResult {
    /// Value of `event` as a float, treating a missing counter as zero.
    fn counter_as_f64(&self, event: &str) -> f64 {
        self.counters.get(event).copied().unwrap_or(0) as f64
    }

    /// `part` as a percentage of `whole`, or `0.0` when `whole` is not positive.
    fn percentage(part: f64, whole: f64) -> f64 {
        if whole > 0.0 {
            part / whole * 100.0
        } else {
            0.0
        }
    }

    /// Instructions retired per cycle (`instructions / cycles`).
    ///
    /// Returns `0.0` when the `cycles` counter is missing or zero.
    pub fn ipc(&self) -> f64 {
        let cycles = self.counter_as_f64("cycles");
        if cycles > 0.0 {
            self.counter_as_f64("instructions") / cycles
        } else {
            0.0
        }
    }

    /// `cache-misses` as a percentage of `cache-references`.
    ///
    /// Returns `0.0` when no cache references were recorded.
    pub fn cache_miss_rate(&self) -> f64 {
        Self::percentage(
            self.counter_as_f64("cache-misses"),
            self.counter_as_f64("cache-references"),
        )
    }

    /// `branch-misses` as a percentage of `branches`.
    ///
    /// Returns `0.0` when no branches were recorded.
    pub fn branch_miss_rate(&self) -> f64 {
        Self::percentage(
            self.counter_as_f64("branch-misses"),
            self.counter_as_f64("branches"),
        )
    }

    /// Share (in percent) of the packed single-precision counters
    /// (128-, 256- and 512-bit) in the sum of the scalar and packed
    /// `fp_arith_inst_retired.*` counters.
    ///
    /// Returns `None` when all of those counters are missing or zero, i.e.
    /// when there is nothing to base a ratio on.
    pub fn simd_utilization(&self) -> Option<f64> {
        let scalar = self.counter_as_f64("fp_arith_inst_retired.scalar_single");
        let packed = self.counter_as_f64("fp_arith_inst_retired.128b_packed_single")
            + self.counter_as_f64("fp_arith_inst_retired.256b_packed_single")
            + self.counter_as_f64("fp_arith_inst_retired.512b_packed_single");

        let all_fp = scalar + packed;
        if all_fp > 0.0 {
            Some(packed / all_fp * 100.0)
        } else {
            None
        }
    }
}
102
103pub fn run_perf_stat(binary: &str, args: &[&str], events: &[&str]) -> Result<PerfStatResult> {
105 let event_str = events.join(",");
106 let mut cmd = Command::new("perf");
107 cmd.arg("stat")
108 .arg("-e")
109 .arg(&event_str)
110 .arg("-x")
111 .arg(",") .arg(binary)
113 .args(args);
114
115 let output = cmd.output()?;
116 let stderr = String::from_utf8_lossy(&output.stderr);
117
118 parse_perf_stat_csv(&stderr)
119}
120
121pub fn parse_perf_stat_csv(output: &str) -> Result<PerfStatResult> {
124 let mut result = PerfStatResult::default();
125
126 for line in output.lines() {
127 let line = line.trim();
128 if line.is_empty() || line.starts_with('#') || line.starts_with("Performance") {
129 continue;
130 }
131
132 if line.contains("seconds time elapsed") {
134 if let Some(time_str) = line.split_whitespace().next() {
135 if let Ok(t) = time_str.parse::<f64>() {
136 result.wall_time_secs = t;
137 }
138 }
139 continue;
140 }
141
142 let fields: Vec<&str> = line.split(',').collect();
144 if fields.len() >= 3 {
145 let value_str = fields[0].trim().replace(' ', "");
146 let event_name = fields[2].trim();
147
148 if let Ok(value) = value_str.parse::<u64>() {
149 result.counters.insert(event_name.to_string(), value);
150 }
151 }
152 }
153
154 Ok(result)
155}
156
157pub fn profile_simd(function: &str, size: u32, arch: &str) -> Result<()> {
159 println!("\n=== CGP SIMD Profile: {function} (size={size}, arch={arch}) ===\n");
160
161 let Some(simd_events) = resolve_simd_events(arch)? else {
162 println!();
163 return Ok(());
164 };
165
166 if which::which("perf").is_err() {
167 print_perf_missing(function, arch);
168 println!();
169 return Ok(());
170 }
171
172 let Some(binary) = find_bench_binary() else {
173 println!(" No benchmark binary found.");
174 println!(" Build with: cargo build --release --bench vector_ops");
175 println!(" Then re-run cgp profile simd.");
176 println!();
177 return Ok(());
178 };
179
180 profile_with_perf(&binary, simd_events);
181 println!();
182 Ok(())
183}
184
185fn resolve_simd_events(arch: &str) -> Result<Option<&'static [&'static str]>> {
188 match arch {
189 "avx2" => {
190 #[cfg(target_arch = "x86_64")]
191 {
192 if !std::arch::is_x86_feature_detected!("avx2") {
193 println!(" Warning: AVX2 not available on this CPU.");
194 }
195 }
196 Ok(Some(AVX2_EVENTS))
197 }
198 "avx512" => {
199 #[cfg(target_arch = "x86_64")]
200 {
201 if !std::arch::is_x86_feature_detected!("avx512f") {
202 println!(" Warning: AVX-512 not available on this CPU.");
203 }
204 }
205 Ok(Some(AVX512_EVENTS))
206 }
207 "neon" => {
208 #[cfg(not(target_arch = "aarch64"))]
209 {
210 println!(" NEON not available -- use --cross-profile for QEMU-based analysis");
211 Ok(None)
212 }
213 #[cfg(target_arch = "aarch64")]
214 {
215 const NEON_EVENTS: &[&str] = &["INST_RETIRED", "CPU_CYCLES", "ASE_SPEC"];
216 Ok(Some(NEON_EVENTS))
217 }
218 }
219 "sse2" => {
220 const SSE2_EVENTS: &[&str] = &[
221 "fp_arith_inst_retired.scalar_single",
222 "fp_arith_inst_retired.128b_packed_single",
223 ];
224 Ok(Some(SSE2_EVENTS))
225 }
226 _ => {
227 anyhow::bail!("Unknown SIMD architecture: {arch}. Supported: avx2, avx512, neon, sse2")
228 }
229 }
230}
231
/// Tells the user that `perf` is unavailable and echoes which function and
/// architecture were requested.
fn print_perf_missing(function: &str, arch: &str) {
    let notice = [
        " perf not found. Install linux-tools-common for hardware counter profiling.",
        " Showing static analysis only.",
    ];
    notice.iter().for_each(|line| println!("{line}"));

    println!("\n Function: {function}");
    println!(" Architecture: {arch}");
}
238
239fn profile_with_perf(binary: &str, simd_events: &[&str]) {
240 println!(" Backend: perf stat");
241 println!(" Binary: {binary}");
242
243 let mut all_events: Vec<&str> = SIMD_PERF_EVENTS.to_vec();
244 all_events.extend_from_slice(simd_events);
245
246 match run_perf_stat(binary, &[], &all_events) {
247 Ok(result) => {
248 warn_if_counters_blocked(&result);
249 print_hardware_counters(&result);
250 print_simd_utilization(&result);
251 if result.wall_time_secs > 0.0 {
252 println!("\n Wall time: {:.3}s", result.wall_time_secs);
253 }
254 }
255 Err(e) => {
256 println!(" perf stat failed: {e}");
257 println!(" Try: sudo sysctl kernel.perf_event_paranoid=2");
258 }
259 }
260}
261
262fn warn_if_counters_blocked(result: &PerfStatResult) {
265 let cycles = *result.counters.get("cycles").unwrap_or(&0);
266 if cycles != 0 || result.counters.is_empty() {
267 return;
268 }
269 let paranoid = std::fs::read_to_string("/proc/sys/kernel/perf_event_paranoid")
270 .ok()
271 .and_then(|s| s.trim().parse::<i32>().ok())
272 .unwrap_or(-1);
273 if paranoid > 2 {
274 println!(
275 " \x1b[33m[WARN]\x1b[0m perf_event_paranoid={paranoid} — hardware counters blocked."
276 );
277 println!(" Fix: sudo sysctl kernel.perf_event_paranoid=2");
278 println!(" Or run: sudo cgp profile simd ...\n");
279 }
280}
281
282fn print_hardware_counters(result: &PerfStatResult) {
283 let cycles = *result.counters.get("cycles").unwrap_or(&0);
284 println!("\n Hardware Counters:");
285 println!(" Cycles: {:>14}", format_count(cycles));
286 println!(
287 " Instructions: {:>14}",
288 format_count(*result.counters.get("instructions").unwrap_or(&0))
289 );
290 println!(" IPC: {:>14.2}", result.ipc());
291 println!(" Cache miss: {:>13.1}%", result.cache_miss_rate());
292 println!(" Branch miss: {:>13.1}%", result.branch_miss_rate());
293}
294
295fn print_simd_utilization(result: &PerfStatResult) {
296 let Some(simd_pct) = result.simd_utilization() else {
297 return;
298 };
299 println!("\n SIMD Utilization:");
300 println!(" Vector ops: {simd_pct:.1}%");
301 println!(" Scalar ops: {:.1}%", 100.0 - simd_pct);
302 if simd_pct < 50.0 {
303 println!(" [WARN] Low SIMD utilization — check for scalar fallbacks");
304 } else {
305 println!(" [OK] Good SIMD utilization");
306 }
307}
308
/// Formats `n` in decimal with a comma between each group of three digits,
/// e.g. `1234567` becomes `"1,234,567"`.
fn format_count(n: u64) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);

    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes wherever a multiple of three digits remains.
        if idx > 0 && (len - idx) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    grouped
}
321
/// Locates a benchmark executable to profile.
///
/// Search order:
///
/// 1. The `benchmark_matrix_suite` example under `release/examples` of
///    `$CARGO_TARGET_DIR` (when set and non-empty), then of the built-in
///    fallback target directories. The first existing path wins.
/// 2. A `vector_ops-<hash>` executable under `release/deps`: only in
///    `$CARGO_TARGET_DIR` when that is set, otherwise in the fallback target
///    directories in order.
///
/// Within a `deps` directory the most recently modified match is chosen and
/// directories are ignored. Directory iteration order is unspecified, so
/// taking the first match could return a stale build left over from an
/// earlier compilation.
///
/// Returns `None` when nothing suitable exists.
fn find_bench_binary() -> Option<String> {
    const FALLBACK_TARGET_DIRS: &[&str] = &["/mnt/nvme-raid0/targets/trueno", "./target"];

    let env_target = std::env::var("CARGO_TARGET_DIR")
        .ok()
        .filter(|dir| !dir.is_empty());

    // The example binary is looked up in the env target dir *and* the fallbacks.
    let mut example_roots: Vec<&str> = Vec::new();
    example_roots.extend(env_target.as_deref());
    example_roots.extend_from_slice(FALLBACK_TARGET_DIRS);

    let example = example_roots
        .iter()
        .map(|root| format!("{root}/release/examples/benchmark_matrix_suite"))
        .find(|path| std::path::Path::new(path).exists());
    if example.is_some() {
        return example;
    }

    // The bench binary is looked up in the env target dir *or* the fallbacks.
    let deps_roots: Vec<&str> = match env_target.as_deref() {
        Some(dir) => vec![dir],
        None => FALLBACK_TARGET_DIRS.to_vec(),
    };
    deps_roots
        .iter()
        .find_map(|root| newest_vector_ops_bench(&format!("{root}/release/deps")))
}

/// Returns the most recently modified `vector_ops-*` entry without a `.` in
/// its name (which excludes `.d` and similar side files) inside `dir`.
fn newest_vector_ops_bench(dir: &str) -> Option<String> {
    std::fs::read_dir(dir)
        .ok()?
        .flatten()
        .filter(|entry| {
            let name = entry.file_name();
            let name = name.to_string_lossy();
            name.starts_with("vector_ops-") && !name.contains('.')
        })
        .filter_map(|entry| {
            let path = entry.path();
            match std::fs::metadata(&path) {
                // A directory can never be the executable.
                Ok(meta) if meta.is_dir() => None,
                Ok(meta) => Some((meta.modified().ok(), path)),
                // Keep entries we cannot stat, but rank them below dated ones.
                Err(_) => Some((None, path)),
            }
        })
        .max_by_key(|(modified, _)| *modified)
        .map(|(_, path)| path.display().to_string())
}
368
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a result pre-populated with the given `(event, value)` pairs.
    fn result_with(counters: &[(&str, u64)]) -> PerfStatResult {
        let mut result = PerfStatResult::default();
        for &(event, value) in counters {
            result.counters.insert(event.to_string(), value);
        }
        result
    }

    /// None of the built-in event lists may be empty.
    #[test]
    fn test_simd_events_defined() {
        for events in [SIMD_PERF_EVENTS, AVX2_EVENTS, AVX512_EVENTS] {
            assert!(!events.is_empty());
        }
    }

    /// An unknown architecture name is reported as an error.
    #[test]
    fn test_invalid_arch_rejected() {
        assert!(profile_simd("test_fn", 1024, "invalid_arch").is_err());
    }

    /// Counter lines in perf's CSV layout end up in the counter map.
    #[test]
    fn test_parse_perf_stat_csv() {
        let output = "1234567,,cycles,,,\n456789,,instructions,,,\n100,,cache-references,,,\n5,,cache-misses,,,\n";
        let parsed = parse_perf_stat_csv(output).unwrap();
        assert_eq!(parsed.counters["cycles"], 1234567);
        assert_eq!(parsed.counters["instructions"], 456789);
    }

    /// 2000 instructions over 1000 cycles is an IPC of 2.
    #[test]
    fn test_perf_stat_ipc() {
        let result = result_with(&[("cycles", 1000), ("instructions", 2000)]);
        assert!((result.ipc() - 2.0).abs() < 0.01);
    }

    /// 50 misses out of 1000 references is a 5% miss rate.
    #[test]
    fn test_perf_stat_cache_miss_rate() {
        let result = result_with(&[("cache-references", 1000), ("cache-misses", 50)]);
        assert!((result.cache_miss_rate() - 5.0).abs() < 0.01);
    }

    /// 900 packed out of 1000 FP operations is 90% SIMD utilization.
    #[test]
    fn test_simd_utilization() {
        let result = result_with(&[
            ("fp_arith_inst_retired.scalar_single", 100),
            ("fp_arith_inst_retired.256b_packed_single", 900),
        ]);
        let util = result.simd_utilization().unwrap();
        assert!((util - 90.0).abs() < 0.01);
    }

    /// Thousands separators appear only between complete groups of three digits.
    #[test]
    fn test_format_count() {
        let cases: [(u64, &str); 4] = [
            (0, "0"),
            (999, "999"),
            (1000, "1,000"),
            (1234567, "1,234,567"),
        ];
        for (input, expected) in cases {
            assert_eq!(format_count(input), expected);
        }
    }
}