1use anyhow::Result;
5use std::collections::HashMap;
6use std::process::Command;
7
/// Baseline perf event names requested for every SIMD profile.
///
/// `profile_simd` always passes these to `perf stat` and appends the
/// architecture-specific events after them. The names are also the keys that
/// `PerfStatResult::ipc`, `cache_miss_rate` and `branch_miss_rate` look up.
pub const SIMD_PERF_EVENTS: &[&str] = &[
    // Inputs to the IPC calculation.
    "cycles",
    "instructions",
    // Inputs to the cache miss rate, plus two more cache-related events.
    "cache-references",
    "cache-misses",
    "L1-dcache-load-misses",
    "LLC-loads",
    // Inputs to the branch miss rate.
    "branches",
    "branch-misses",
];
19
/// Extra perf event names requested when `profile_simd` runs with arch `avx2`.
///
/// Covers the scalar, 128-bit packed and 256-bit packed single-precision
/// counters; the names are the keys read by `PerfStatResult::simd_utilization`.
pub const AVX2_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.128b_packed_single",
    "fp_arith_inst_retired.256b_packed_single",
];
26
/// Extra perf event names requested when `profile_simd` runs with arch `avx512`.
///
/// Covers the scalar, 256-bit packed and 512-bit packed single-precision
/// counters. The 128-bit packed counter is not part of this list, so
/// `PerfStatResult::simd_utilization` treats it as zero for such runs.
pub const AVX512_EVENTS: &[&str] = &[
    "fp_arith_inst_retired.scalar_single",
    "fp_arith_inst_retired.256b_packed_single",
    "fp_arith_inst_retired.512b_packed_single",
];
32
/// Counter values collected from one `perf stat` run.
#[derive(Debug, Clone, Default)]
pub struct PerfStatResult {
    /// Raw counter values keyed by perf event name.
    pub counters: HashMap<String, u64>,
    /// Wall-clock time in seconds; the derived default is `0.0`.
    pub wall_time_secs: f64,
}

impl PerfStatResult {
    /// Returns the named counter as a float, treating a missing event as zero.
    fn count(&self, event: &str) -> f64 {
        self.counters.get(event).copied().unwrap_or(0) as f64
    }

    /// Returns `part / whole` as a percentage, or `None` when `whole` is zero.
    fn percentage(part: f64, whole: f64) -> Option<f64> {
        if whole > 0.0 {
            Some(part / whole * 100.0)
        } else {
            None
        }
    }

    /// Instructions per cycle, or `0.0` when no cycles were recorded.
    pub fn ipc(&self) -> f64 {
        match self.count("cycles") {
            cycles if cycles > 0.0 => self.count("instructions") / cycles,
            _ => 0.0,
        }
    }

    /// `cache-misses` as a percentage of `cache-references`, or `0.0` when no
    /// references were recorded.
    pub fn cache_miss_rate(&self) -> f64 {
        Self::percentage(self.count("cache-misses"), self.count("cache-references"))
            .unwrap_or(0.0)
    }

    /// `branch-misses` as a percentage of `branches`, or `0.0` when no
    /// branches were recorded.
    pub fn branch_miss_rate(&self) -> f64 {
        Self::percentage(self.count("branch-misses"), self.count("branches")).unwrap_or(0.0)
    }

    /// Percentage of counted single-precision FP instructions that were packed
    /// (128-, 256- or 512-bit) rather than scalar.
    ///
    /// Returns `None` when none of the `fp_arith_inst_retired.*` counters hold
    /// a non-zero value, i.e. there is nothing to compute a ratio from.
    pub fn simd_utilization(&self) -> Option<f64> {
        let scalar = self.count("fp_arith_inst_retired.scalar_single");
        let packed = self.count("fp_arith_inst_retired.128b_packed_single")
            + self.count("fp_arith_inst_retired.256b_packed_single")
            + self.count("fp_arith_inst_retired.512b_packed_single");

        Self::percentage(packed, scalar + packed)
    }
}
102
103pub fn run_perf_stat(binary: &str, args: &[&str], events: &[&str]) -> Result<PerfStatResult> {
105 let event_str = events.join(",");
106 let mut cmd = Command::new("perf");
107 cmd.arg("stat")
108 .arg("-e")
109 .arg(&event_str)
110 .arg("-x")
111 .arg(",") .arg(binary)
113 .args(args);
114
115 let output = cmd.output()?;
116 let stderr = String::from_utf8_lossy(&output.stderr);
117
118 parse_perf_stat_csv(&stderr)
119}
120
121pub fn parse_perf_stat_csv(output: &str) -> Result<PerfStatResult> {
124 let mut result = PerfStatResult::default();
125
126 for line in output.lines() {
127 let line = line.trim();
128 if line.is_empty() || line.starts_with('#') || line.starts_with("Performance") {
129 continue;
130 }
131
132 if line.contains("seconds time elapsed") {
134 if let Some(time_str) = line.split_whitespace().next() {
135 if let Ok(t) = time_str.parse::<f64>() {
136 result.wall_time_secs = t;
137 }
138 }
139 continue;
140 }
141
142 let fields: Vec<&str> = line.split(',').collect();
144 if fields.len() >= 3 {
145 let value_str = fields[0].trim().replace(' ', "");
146 let event_name = fields[2].trim();
147
148 if let Ok(value) = value_str.parse::<u64>() {
149 result.counters.insert(event_name.to_string(), value);
150 }
151 }
152 }
153
154 Ok(result)
155}
156
157pub fn profile_simd(function: &str, size: u32, arch: &str) -> Result<()> {
159 println!("\n=== CGP SIMD Profile: {function} (size={size}, arch={arch}) ===\n");
160
161 let simd_events: &[&str] = match arch {
163 "avx2" => {
164 #[cfg(target_arch = "x86_64")]
165 {
166 if !std::arch::is_x86_feature_detected!("avx2") {
167 println!(" Warning: AVX2 not available on this CPU.");
168 }
169 }
170 AVX2_EVENTS
171 }
172 "avx512" => {
173 #[cfg(target_arch = "x86_64")]
174 {
175 if !std::arch::is_x86_feature_detected!("avx512f") {
176 println!(" Warning: AVX-512 not available on this CPU.");
177 }
178 }
179 AVX512_EVENTS
180 }
181 "neon" => {
182 #[cfg(not(target_arch = "aarch64"))]
183 {
184 println!(" NEON not available -- use --cross-profile for QEMU-based analysis");
185 return Ok(());
186 }
187 #[cfg(target_arch = "aarch64")]
188 {
189 &["INST_RETIRED", "CPU_CYCLES", "ASE_SPEC"][..]
190 }
191 }
192 "sse2" => &[
193 "fp_arith_inst_retired.scalar_single",
194 "fp_arith_inst_retired.128b_packed_single",
195 ][..],
196 _ => {
197 anyhow::bail!("Unknown SIMD architecture: {arch}. Supported: avx2, avx512, neon, sse2")
198 }
199 };
200
201 let has_perf = which::which("perf").is_ok();
202 if !has_perf {
203 println!(" perf not found. Install linux-tools-common for hardware counter profiling.");
204 println!(" Showing static analysis only.");
205 println!("\n Function: {function}");
206 println!(" Architecture: {arch}");
207 return Ok(());
208 }
209
210 let bin_path = find_bench_binary();
212 match bin_path {
213 Some(binary) => {
214 println!(" Backend: perf stat");
215 println!(" Binary: {binary}");
216
217 let mut all_events: Vec<&str> = SIMD_PERF_EVENTS.to_vec();
219 all_events.extend_from_slice(simd_events);
220
221 match run_perf_stat(&binary, &[], &all_events) {
222 Ok(result) => {
223 let cycles = *result.counters.get("cycles").unwrap_or(&0);
225 if cycles == 0 && !result.counters.is_empty() {
226 let paranoid =
227 std::fs::read_to_string("/proc/sys/kernel/perf_event_paranoid")
228 .ok()
229 .and_then(|s| s.trim().parse::<i32>().ok())
230 .unwrap_or(-1);
231 if paranoid > 2 {
232 println!(" \x1b[33m[WARN]\x1b[0m perf_event_paranoid={paranoid} — hardware counters blocked.");
233 println!(" Fix: sudo sysctl kernel.perf_event_paranoid=2");
234 println!(" Or run: sudo cgp profile simd ...\n");
235 }
236 }
237
238 println!("\n Hardware Counters:");
239 println!(" Cycles: {:>14}", format_count(cycles));
240 println!(
241 " Instructions: {:>14}",
242 format_count(*result.counters.get("instructions").unwrap_or(&0))
243 );
244 println!(" IPC: {:>14.2}", result.ipc());
245 println!(" Cache miss: {:>13.1}%", result.cache_miss_rate());
246 println!(" Branch miss: {:>13.1}%", result.branch_miss_rate());
247
248 if let Some(simd_pct) = result.simd_utilization() {
249 println!("\n SIMD Utilization:");
250 println!(" Vector ops: {simd_pct:.1}%");
251 println!(" Scalar ops: {:.1}%", 100.0 - simd_pct);
252 if simd_pct < 50.0 {
253 println!(
254 " [WARN] Low SIMD utilization — check for scalar fallbacks"
255 );
256 } else {
257 println!(" [OK] Good SIMD utilization");
258 }
259 }
260
261 if result.wall_time_secs > 0.0 {
262 println!("\n Wall time: {:.3}s", result.wall_time_secs);
263 }
264 }
265 Err(e) => {
266 println!(" perf stat failed: {e}");
267 println!(" Try: sudo sysctl kernel.perf_event_paranoid=2");
268 }
269 }
270 }
271 None => {
272 println!(" No benchmark binary found.");
273 println!(" Build with: cargo build --release --bench vector_ops");
274 println!(" Then re-run cgp profile simd.");
275 }
276 }
277
278 println!();
279 Ok(())
280}
281
/// Formats `n` in decimal with a comma between each group of three digits,
/// e.g. `1234567` becomes `"1,234,567"`.
fn format_count(n: u64) -> String {
    let digits = n.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);

    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes wherever a multiple of three digits remains.
        if idx > 0 && (len - idx) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }

    grouped
}
294
/// Locates a benchmark binary to profile, returning its path as a string.
///
/// Probe order:
/// 1. the `benchmark_matrix_suite` example under `$CARGO_TARGET_DIR/release`
///    (only when that variable is set and non-empty), then under two fixed
///    fallback locations;
/// 2. the first directory entry named `vector_ops-*` without a `.` in its
///    name inside the `release/deps` directory of `$CARGO_TARGET_DIR`, or of
///    the two fallback locations when the variable is unset.
///
/// Returns `None` when nothing matches.
fn find_bench_binary() -> Option<String> {
    let target_dir = std::env::var("CARGO_TARGET_DIR").unwrap_or_default();
    let has_target_dir = !target_dir.is_empty();

    // Step 1: well-known example binary locations, most specific first.
    let mut example_paths: Vec<String> = Vec::with_capacity(3);
    if has_target_dir {
        example_paths.push(format!(
            "{target_dir}/release/examples/benchmark_matrix_suite"
        ));
    }
    example_paths
        .push("/mnt/nvme-raid0/targets/trueno/release/examples/benchmark_matrix_suite".to_string());
    example_paths.push("./target/release/examples/benchmark_matrix_suite".to_string());

    if let Some(found) = example_paths
        .into_iter()
        .find(|candidate| std::path::Path::new(candidate).exists())
    {
        return Some(found);
    }

    // Step 2: scan the deps directories for a hashed `vector_ops-<hash>` name.
    let deps_dirs: Vec<String> = if has_target_dir {
        vec![format!("{target_dir}/release/deps")]
    } else {
        vec![
            "/mnt/nvme-raid0/targets/trueno/release/deps".to_string(),
            "./target/release/deps".to_string(),
        ]
    };

    // The chain is lazy, so a later directory is only read when the earlier
    // ones yielded no match; unreadable directories and entries are skipped.
    deps_dirs
        .iter()
        .filter_map(|dir| std::fs::read_dir(dir).ok())
        .flat_map(|entries| entries.flatten())
        .find(|entry| {
            let file_name = entry.file_name();
            let file_name = file_name.to_string_lossy();
            // Names containing a '.' are excluded.
            file_name.starts_with("vector_ops-") && !file_name.contains('.')
        })
        .map(|entry| entry.path().display().to_string())
}
341
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `PerfStatResult` holding exactly the given counters.
    fn result_with(counters: &[(&str, u64)]) -> PerfStatResult {
        let mut result = PerfStatResult::default();
        result.counters.extend(
            counters
                .iter()
                .map(|&(name, value)| (name.to_string(), value)),
        );
        result
    }

    /// Asserts that `actual` is within 0.01 of `expected`.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < 0.01);
    }

    #[test]
    fn test_simd_events_defined() {
        for events in [SIMD_PERF_EVENTS, AVX2_EVENTS, AVX512_EVENTS] {
            assert!(!events.is_empty());
        }
    }

    #[test]
    fn test_invalid_arch_rejected() {
        assert!(profile_simd("test_fn", 1024, "invalid_arch").is_err());
    }

    #[test]
    fn test_parse_perf_stat_csv() {
        let output = "1234567,,cycles,,,\n456789,,instructions,,,\n100,,cache-references,,,\n5,,cache-misses,,,\n";
        let parsed = parse_perf_stat_csv(output).unwrap();
        assert_eq!(parsed.counters["cycles"], 1234567);
        assert_eq!(parsed.counters["instructions"], 456789);
    }

    #[test]
    fn test_perf_stat_ipc() {
        let result = result_with(&[("cycles", 1000), ("instructions", 2000)]);
        assert_close(result.ipc(), 2.0);
    }

    #[test]
    fn test_perf_stat_cache_miss_rate() {
        let result = result_with(&[("cache-references", 1000), ("cache-misses", 50)]);
        assert_close(result.cache_miss_rate(), 5.0);
    }

    #[test]
    fn test_simd_utilization() {
        let result = result_with(&[
            ("fp_arith_inst_retired.scalar_single", 100),
            ("fp_arith_inst_retired.256b_packed_single", 900),
        ]);
        let util = result.simd_utilization().unwrap();
        assert_close(util, 90.0);
    }

    #[test]
    fn test_format_count() {
        let cases = [
            (0, "0"),
            (999, "999"),
            (1000, "1,000"),
            (1234567, "1,234,567"),
        ];
        for (input, expected) in cases {
            assert_eq!(format_count(input), expected);
        }
    }
}